yury-classifier 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,303 @@
1
module Classifier
  # Shared behaviour for the concrete classifiers (e.g. Bayes):
  # tokenisation, punctuation stripping, stemming and stop-word filtering.
  class Base

    # Options:
    #   :language - stemmer / stop-word language code (default 'en')
    #   :encoding - stemmer encoding (default 'UTF_8')
    #
    # NOTE(review): Hash#reverse_merge! comes from ActiveSupport (not stdlib)
    # and mutates the caller-supplied hash in place — subclasses (Bayes) rely
    # on the same hash reaching this method via `super`. Confirm ActiveSupport
    # is loaded before this file.
    def initialize(options = {})
      options.reverse_merge!(:language => 'en')
      options.reverse_merge!(:encoding => 'UTF_8')

      # Stored whole; later handed to Lingua::Stemmer.new in word_hash_for_words.
      @options = options
    end
10
+
11
# Normalises a category identifier into its canonical Symbol form:
# underscores become spaces, the first character is upcased, and the
# result is interned.
#   prepare_category_name(:the_other)  # => :"The other"
def prepare_category_name val
  val.to_s.tr("_", " ").capitalize.to_sym
end
14
+
15
# Removes common punctuation symbols, returning a new string.
# E.g.,
#   "Hello (greeting's), with {braces} < >...?".without_punctuation
#   => "Hello greetings with braces "
# Most punctuation is replaced by a space; apostrophes and hyphens are
# deleted outright so contractions collapse ("don't" -> "dont").
def without_punctuation str
  spaced = str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " )
  spaced.tr( "'\-", "" )
end
22
+
23
# Return a Hash of strings => ints. Each word in the string is stemmed,
# interned, and indexes to its frequency in the document.
# Word-character tokens and pure-symbol runs are tokenised separately so
# both kinds contribute entries.
def word_hash str
  word_tokens   = str.gsub(/[^\w\s]/,"").split
  symbol_tokens = str.gsub(/[\w]/," ").split
  word_hash_for_words(word_tokens + symbol_tokens)
end
28
+
29
# Return a word hash without extra punctuation or short symbols, just
# stemmed words — punctuation/symbol runs are discarded before counting.
def clean_word_hash str
  tokens = str.gsub(/[^\w\s]/,"").split
  word_hash_for_words tokens
end
33
+
34
private

# Builds the frequency Hash (stemmed-word Symbol => count) for an array of
# tokens.
#
# NOTE(review): depends on two non-stdlib pieces — Lingua::Stemmer
# (ruby-stemmer gem, configured from @options) and String#mb_chars
# (ActiveSupport). The bang call assumes mb_chars' downcase! mutates the
# underlying string, as it did in the ActiveSupport versions this gem
# targets — verify against the bundled version.
def word_hash_for_words(words)
  stemmer = Lingua::Stemmer.new(@options)
  d = Hash.new
  # Languages without a stop-word list fall back to filtering nothing.
  skip_words = SKIP_WORDS[@options[:language]] || []
  words.each do |word|
    # Multibyte-aware, in-place downcase for tokens containing word chars.
    word.mb_chars.downcase! if word =~ /[\w]+/
    key = stemmer.stem(word).intern
    # Precedence is `a || (b && c)`: pure-symbol tokens always count;
    # word tokens count only when not a stop word AND longer than 2 chars.
    if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
      d[key] ||= 0
      d[key] += 1
    end
  end
  return d
end
50
+
51
# English stop words skipped by word_hash_for_words. NOTE(review): entries
# such as "sinc", "th", "didn" and "itll" look like pre-stemmed or
# apostrophe-stripped forms rather than dictionary words — presumably they
# are meant to match tokens after without_punctuation/stemming; verify.
EN_CORPUS_SKIP_WORDS = [
  "a",
  "again",
  "all",
  "along",
  "are",
  "also",
  "an",
  "and",
  "as",
  "at",
  "but",
  "by",
  "came",
  "can",
  "cant",
  "couldnt",
  "did",
  "didn",
  "didnt",
  "do",
  "doesnt",
  "dont",
  "ever",
  "first",
  "from",
  "have",
  "her",
  "here",
  "him",
  "how",
  "i",
  "if",
  "in",
  "into",
  "is",
  "isnt",
  "it",
  "itll",
  "just",
  "last",
  "least",
  "like",
  "most",
  "my",
  "new",
  "no",
  "not",
  "now",
  "of",
  "on",
  "or",
  "should",
  "sinc",
  "so",
  "some",
  "th",
  "than",
  "this",
  "that",
  "the",
  "their",
  "then",
  "those",
  "to",
  "told",
  "too",
  "true",
  "try",
  "until",
  "url",
  "us",
  "were",
  "when",
  "whether",
  "while",
  "with",
  "within",
  "yes",
  "you",
  "youll",
]

# Russian stop words, taken from the Snowball project's stop list:
# http://snowball.tartarus.org/algorithms/russian/stop.txt
RU_CORPUS_SKIP_WORDS = [
  "и", # and
  "в", # in/into
  "во", # alternative form
  "не", # not
  "что", # what/that
  "он", # he
  "на", # on/onto
  "я", # i
  "с", # from
  "со", # alternative form
  "как", # how
  "а", # milder form of `no' (but)
  "то", # conjunction and form of `that'
  "все", # all
  "она", # she
  "так", # so, thus
  "его", # him
  "но", # but
  "да", # yes/and
  "ты", # thou
  "к", # towards, by
  "у", # around, chez
  "же", # intensifier particle
  "вы", # you
  "за", # beyond, behind
  "бы", # conditional/subj. particle
  "по", # up to, along
  "только", # only
  "ее", # her
  "мне", # to me
  "было", # it was
  "вот", # here is/are, particle
  "от", # away from
  "меня", # me
  "еще", # still, yet, more
  "нет", # no, there isnt/arent
  "о", # about
  "из", # out of
  "ему", # to him
  "теперь", # now
  "когда", # when
  "даже", # even
  "ну", # so, well
  "вдруг", # suddenly
  "ли", # interrogative particle
  "если", # if
  "уже", # already, but homonym of `narrower'
  "или", # or
  "ни", # neither
  "быть", # to be
  "был", # he was
  "него", # prepositional form of его
  "до", # up to
  "вас", # you accusative
  "нибудь", # indef. suffix preceded by hyphen
  "опять", # again
  "уж", # already, but homonym of `adder'
  "вам", # to you
  "сказал", # he said
  "ведь", # particle `after all'
  "там", # there
  "потом", # then
  "себя", # oneself
  "ничего", # nothing
  "ей", # to her
  "может", # usually with `быть' as `maybe'
  "они", # they
  "тут", # here
  "где", # where
  "есть", # there is/are
  "надо", # got to, must
  "ней", # prepositional form of ей
  "для", # for
  "мы", # we
  "тебя", # thee
  "их", # them, their
  "чем", # than
  "была", # she was
  "сам", # self
  "чтоб", # in order to
  "без", # without
  "будто", # as if
  "человек", # man, person, one
  "чего", # genitive form of `what'
  "раз", # once
  "тоже", # also
  "себе", # to oneself
  "под", # beneath
  "жизнь", # life
  "будет", # will be
  "ж", # short form of intensifer particle `же'
  "тогда", # then
  "кто", # who
  "этот", # this
  "говорил", # was saying
  "того", # genitive form of `that'
  "потому", # for that reason
  "этого", # genitive form of `this'
  "какой", # which
  "совсем", # altogether
  "ним", # prepositional form of `его', `они'
  "здесь", # here
  "этом", # prepositional form of `этот'
  "один", # one
  "почти", # almost
  "мой", # my
  "тем", # instrumental/dative plural of `тот', `то'
  "чтобы", # full form of `in order that'
  "нее", # her (acc.)
  "кажется", # it seems
  "сейчас", # now
  "были", # they were
  "куда", # where to
  "зачем", # why
  "сказать", # to say
  "всех", # all (acc., gen. preposn. plural)
  "никогда", # never
  "сегодня", # today
  "можно", # possible, one can
  "при", # by
  "наконец", # finally
  "два", # two
  "об", # alternative form of `о', about
  "другой", # another
  "хоть", # even
  "после", # after
  "над", # above
  "больше", # more
  "тот", # that one (masc.)
  "через", # across, in
  "эти", # these
  "нас", # us
  "про", # about
  "всего", # in all, only, of all
  "них", # prepositional form of `они' (they)
  "какая", # which, feminine
  "много", # lots
  "разве", # interrogative particle
  "сказала", # she said
  "три", # three
  "эту", # this, acc. fem. sing.
  "моя", # my, feminine
  "впрочем", # moreover, besides
  "хорошо", # good
  "свою", # ones own, acc. fem. sing.
  "этой", # oblique form of `эта', fem. `this'
  "перед", # in front of
  "иногда", # sometimes
  "лучше", # better
  "чуть", # a little
  "том", # preposn. form of `that one'
  "нельзя", # one must not
  "такой", # such a one
  "им", # to them
  "более", # more
  "всегда", # always
  "конечно", # of course
  "всю", # acc. fem. sing of `all'
  "между", # between
]

# Maps the :language option to its stop-word list; unknown languages fall
# back to an empty list inside word_hash_for_words.
SKIP_WORDS = {
  'en' => EN_CORPUS_SKIP_WORDS,
  'ru' => RU_CORPUS_SKIP_WORDS
}
@@ -0,0 +1,134 @@
1
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module Classifier

  # Naive-Bayes text classifier built on Classifier::Base's tokenisation
  # and stemming.
  class Bayes < Classifier::Base

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
    # you can specify language and encoding parameters for stemmer
    # (default values - :language => 'en', :encoding => 'UTF_8')
    #      b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
    def initialize(options = {})
      # category Symbol => { stemmed word Symbol => count }
      @categories = Hash.new
      # NOTE(review): reverse_merge! is ActiveSupport and mutates `options`;
      # the same (mutated) hash then reaches Base#initialize via bare `super`.
      options.reverse_merge!(:categories => [])
      # Category names are normalised, e.g. 'the_other' -> :"The other".
      options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
      # Running total of every trained word occurrence across all categories.
      @total_words = 0
      super
    end
22
+
23
#
# Provides a general training method for all categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.train "that", "That text"
#     b.train "The other", "The other text"
def train(category, text)
  category = prepare_category_name(category)
  tally = @categories[category]
  word_hash(text).each do |word, freq|
    tally[word] = (tally[word] || 0) + freq
    @total_words += freq
  end
end
38
+
39
#
# Provides a untraining method for all categories specified in Bayes#new
# Be very careful with this method.
#
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.untrain :this, "This text"
#
# FIX(review): the original captured the previous count into `orig` *before*
# defaulting the entry, so untraining a word that had never been trained
# left `count = nil` and crashed on `@total_words -= nil`. Unknown words now
# contribute nothing, and a word can never subtract more occurrences than
# were actually recorded.
def untrain(category, text)
  category = prepare_category_name(category)
  word_hash(text).each do |word, count|
    next unless @total_words >= 0
    orig = @categories[category][word] || 0
    remaining = orig - count
    if remaining > 0
      @categories[category][word] = remaining
    else
      # Entry exhausted: drop it and only subtract what was stored.
      @categories[category].delete(word)
      count = orig
    end
    @total_words -= count
  end
end
62
+
63
#
# Returns the scores in each category the provided +text+. E.g.,
#    b.classifications "I hate bad words and you"
#    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
# The largest of these scores (the one closest to 0) is the one picked out by #classify
def classifications(text)
  scores = Hash.new
  words = word_hash(text)
  @categories.each do |category, category_words|
    label = category.to_s
    scores[label] = 0
    total = category_words.values.sum
    words.each do |word, _count|
      # Unseen words get a small pseudo-count of 0.1.
      seen = category_words.fetch(word, 0.1)
      scores[label] += Math.log(seen / total.to_f)
    end
  end
  scores
end
80
+
81
#
# Returns the classification of the provided +text+, which is one of the
# categories given in the initializer. E.g.,
#    b.classify "I hate bad words and you"
#    =>  'Uninteresting'
def classify(text)
  ranked = classifications(text).sort_by { |_name, score| -score }
  ranked.first.first
end
89
+
90
#
# Provides training and untraining methods for the categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train_this "This text"
#     b.train_that "That text"
#     b.untrain_that "That text"
#     b.train_the_other "The other text"
#
# FIX(review): the original dispatched via `eval("#{$1}train(...)")`, where
# $1 was a side effect of a *previous* gsub, and its unanchored regex meant
# any method name that merely resembled a category could silently train it.
# Replaced with an anchored match and direct train/untrain calls; anything
# that is not train_*/untrain_* falls through to `super` (NoMethodError).
def method_missing(name, *args)
  match = /\A(un)?train_([\w]+)\z/.match(name.to_s)
  if match
    category = prepare_category_name(match[2])
    raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)
    args.each { |text| match[1] ? untrain(category, text) : train(category, text) }
  else
    super
  end
end

# Keep respond_to? consistent with the dynamic train_*/untrain_* handlers.
def respond_to_missing?(name, include_private = false)
  !!(name.to_s =~ /\A(un)?train_([\w]+)\z/) || super
end
108
+
109
#
# Provides a list of category names
# For example:
#     b.categories
#     =>  ['This', 'That', 'the_other']
def categories # :nodoc:
  @categories.keys.map { |name| name.to_s }
end
117
+
118
#
# Allows you to add categories to the classifier.
# For example:
#     b.add_category "Not spam"
#
# WARNING: Adding categories to a trained classifier will
# result in an undertrained category that will tend to match
# more criteria than the trained selective categories. In short,
# try to initialize your categories at initialization.
def add_category(category)
  @categories[prepare_category_name(category)] = {}
end

alias append_category add_category
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
# Extensions to the stdlib Vector used by the pure-Ruby SVD path.
class Vector
  # Euclidean (L2) length of the vector.
  def magnitude
    Math.sqrt((0...size).reduce(0.0) { |acc, i| acc + self[i] ** 2.0 })
  end

  # Returns a new Vector scaled to unit length.
  def normalize
    mag = magnitude
    Vector[*(0...size).map { |i| self[i] / mag }]
  end
end
28
+
29
# Extensions to the stdlib Matrix enabling an all-Ruby SVD (used when the
# GSL bindings are not available).
class Matrix
  # Builds a diagonal matrix from an array of values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Singular value decomposition via cyclic Jacobi rotations applied to the
  # Gram matrix (A'A when rows >= cols, else AA'). Returns [u, v, s] where
  # s is an Array of singular values.
  #
  # NOTE(review): iteration stops when the diagonal changes by <= 0.001
  # between sweeps or after maxSweeps sweeps — no stronger convergence
  # guarantee.
  def SV_decomp(maxSweeps = 20)
    # Use the smaller Gram matrix so q is square and symmetric.
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    azrot = nil # NOTE(review): assigned but never used — dead variable
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Rotation angle chosen to zero qrot[row,col].
          # NOTE(review): divides by (qrot[row,row]-qrot[col,col]); equal
          # diagonal entries would divide by zero — presumably the inputs
          # avoid that case, verify.
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        # Accumulate only the diagonal deltas still above the tolerance.
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end # of do while true
    # Singular values = square roots of the (converged) diagonal.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end
    #puts "cnt = #{cnt}"
    if self.row_size >= self.column_size
      mu = self * v * Matrix.diagonal(*s).inverse
      return [mu, v, s]
    else
      # NOTE(review): the puts below look like leftover debug output —
      # candidates for removal.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size

      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end
  # Element assignment. Stdlib Matrix is immutable by design; this pokes
  # @rows directly so the Jacobi rotations above can be built in place.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
1
# Marshal support for GSL (GNU Scientific Library bindings) vectors, plus a
# Matrix.diag alias mirroring the pure-Ruby Matrix extension.
module GSL

  class Vector
    # Marshal hook: serialise the vector as a plain Ruby Array.
    # (+v+ is the recursion-depth argument Marshal passes to every _dump.)
    def _dump(v)
      Marshal.dump( self.to_a )
    end

    # Marshal hook: rebuild the GSL vector from the dumped Array.
    # NOTE(review): Marshal.load must only ever be fed trusted data.
    def self._load(arr)
      arry = Marshal.load(arr)
      return GSL::Vector.alloc(arry)
    end

  end

  class Matrix
    # Let GSL::Matrix answer to .diag like the stdlib Matrix extension does.
    class <<self
      alias :diag :diagonal
    end
  end
end
@@ -0,0 +1,72 @@
1
# Author:: David Fayram (mailto:dfayram@lensmen.net)
# Copyright:: Copyright (c) 2005 David Fayram II
# License:: LGPL

module Classifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
       :lsi_vector, :lsi_norm,
       :categories

    attr_reader :word_hash
    # If text_proc is not specified, the source will be duck-typed
    # via source.to_s
    def initialize( word_hash, *categories )
      # NOTE(review): the splat guarantees an Array, so `|| []` never fires.
      @categories = categories || []
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    def raw_vector_with( word_list )
      # $GSL is a global toggle set elsewhere when the GSL bindings loaded.
      if $GSL
        vec = GSL::Vector.alloc(word_list.size)
      else
        vec = Array.new(word_list.size, 0)
      end

      # Words without a dimension in word_list are silently dropped.
      @word_hash.each_key do |word|
        vec[word_list[word]] = @word_hash[word] if word_list[word]
      end

      # Perform the scaling transform
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      # NOTE(review): this looks like a log-entropy term weighting (log of
      # counts scaled by the entropy of the term distribution) — confirm
      # against the LSI literature before relying on the exact scheme.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          if ( term > 0 )
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      # Store both the raw vector and its normalized form, using GSL types
      # when available and stdlib Vector otherwise.
      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

class String
  # Returns an LSI-based extract of the +count+ most representative
  # sentences, joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Same as #summary but works on whole paragraphs.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence-ending punctuation; the capture group keeps the
  # delimiters in the result array.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines (Unix, old-Mac and Windows line endings).
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Feeds the non-trivial chunks into an LSI index and joins the +count+
  # most content-bearing chunks back together.
  #
  # FIX(review): the original ended with
  #   summaries.reject { |chunk| !summaries.include? chunk }
  # which filters a list against itself — a guaranteed no-op. Dropped.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    # Skip empty chunks and single-word chunks — too little signal for LSI.
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    summaries.map { |chunk| chunk.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
# Author:: David Fayram (mailto:dfayram@lensmen.net)
# Copyright:: Copyright (c) 2005 David Fayram II
# License:: LGPL

module Classifier
  # Maintains a word => dimension-index mapping, used to place stemmed
  # words into a fixed vector space.
  class WordList
    def initialize
      @location_table = {}
    end

    # Registers +word+ if unseen, assigning it the next free dimension.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Returns the dimension of +lookup+, or nil if it was never added.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word occupying dimension +ind+ (indices are
    # unique, so Hash#key is unambiguous here).
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Number of words (= dimensions) mapped so far.
    def size
      @location_table.size
    end
  end
end