yury-classifier 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,303 @@
1
+ module Classifier
2
+ class Base
3
+
4
+ def initialize(options = {})
5
+ options.reverse_merge!(:language => 'en')
6
+ options.reverse_merge!(:encoding => 'UTF_8')
7
+
8
+ @options = options
9
+ end
10
+
11
+ def prepare_category_name val
12
+ val.to_s.gsub("_"," ").capitalize.intern
13
+ end
14
+
15
+ # Removes common punctuation symbols, returning a new string.
16
+ # E.g.,
17
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
18
+ # => "Hello greetings with braces "
19
+ def without_punctuation str
20
+ str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
21
+ end
22
+
23
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
24
+ # interned, and indexes to its frequency in the document.
25
+ def word_hash str
26
+ word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split)
27
+ end
28
+
29
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
30
+ def clean_word_hash str
31
+ word_hash_for_words str.gsub(/[^\w\s]/,"").split
32
+ end
33
+
34
    private

    # Builds a Symbol => Integer frequency table for +words+.
    # Each word is downcased, stemmed with a stemmer built from @options
    # (so :language/:encoding apply), interned, and counted. Stop words and
    # words of 2 characters or fewer are skipped.
    def word_hash_for_words(words)
      stemmer = Lingua::Stemmer.new(@options)
      d = Hash.new
      # Languages without a configured stop list get an empty one.
      skip_words = SKIP_WORDS[@options[:language]] || []
      words.each do |word|
        # NOTE(review): relies on ActiveSupport's mb_chars; the bang method
        # appears intended to downcase +word+ in place — confirm the
        # ActiveSupport version in use actually mutates the wrapped string.
        word.mb_chars.downcase! if word =~ /[\w]+/
        key = stemmer.stem(word).intern
        # Precedence: parses as `non_word || (not_skipped && long_enough)` —
        # punctuation-only tokens are always counted; word tokens must not be
        # stop words and must be longer than 2 characters.
        if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
          d[key] ||= 0
          d[key] += 1
        end
      end
      return d
    end
50
+
51
+ EN_CORPUS_SKIP_WORDS = [
52
+ "a",
53
+ "again",
54
+ "all",
55
+ "along",
56
+ "are",
57
+ "also",
58
+ "an",
59
+ "and",
60
+ "as",
61
+ "at",
62
+ "but",
63
+ "by",
64
+ "came",
65
+ "can",
66
+ "cant",
67
+ "couldnt",
68
+ "did",
69
+ "didn",
70
+ "didnt",
71
+ "do",
72
+ "doesnt",
73
+ "dont",
74
+ "ever",
75
+ "first",
76
+ "from",
77
+ "have",
78
+ "her",
79
+ "here",
80
+ "him",
81
+ "how",
82
+ "i",
83
+ "if",
84
+ "in",
85
+ "into",
86
+ "is",
87
+ "isnt",
88
+ "it",
89
+ "itll",
90
+ "just",
91
+ "last",
92
+ "least",
93
+ "like",
94
+ "most",
95
+ "my",
96
+ "new",
97
+ "no",
98
+ "not",
99
+ "now",
100
+ "of",
101
+ "on",
102
+ "or",
103
+ "should",
104
+ "sinc",
105
+ "so",
106
+ "some",
107
+ "th",
108
+ "than",
109
+ "this",
110
+ "that",
111
+ "the",
112
+ "their",
113
+ "then",
114
+ "those",
115
+ "to",
116
+ "told",
117
+ "too",
118
+ "true",
119
+ "try",
120
+ "until",
121
+ "url",
122
+ "us",
123
+ "were",
124
+ "when",
125
+ "whether",
126
+ "while",
127
+ "with",
128
+ "within",
129
+ "yes",
130
+ "you",
131
+ "youll",
132
+ ]
133
+
134
+ # http://snowball.tartarus.org/algorithms/russian/stop.txt
135
+ RU_CORPUS_SKIP_WORDS = [
136
+ "и", # and
137
+ "в", # in/into
138
+ "во", # alternative form
139
+ "не", # not
140
+ "что", # what/that
141
+ "он", # he
142
+ "на", # on/onto
143
+ "я", # i
144
+ "с", # from
145
+ "со", # alternative form
146
+ "как", # how
147
+ "а", # milder form of `no' (but)
148
+ "то", # conjunction and form of `that'
149
+ "все", # all
150
+ "она", # she
151
+ "так", # so, thus
152
+ "его", # him
153
+ "но", # but
154
+ "да", # yes/and
155
+ "ты", # thou
156
+ "к", # towards, by
157
+ "у", # around, chez
158
+ "же", # intensifier particle
159
+ "вы", # you
160
+ "за", # beyond, behind
161
+ "бы", # conditional/subj. particle
162
+ "по", # up to, along
163
+ "только", # only
164
+ "ее", # her
165
+ "мне", # to me
166
+ "было", # it was
167
+ "вот", # here is/are, particle
168
+ "от", # away from
169
+ "меня", # me
170
+ "еще", # still, yet, more
171
+ "нет", # no, there isnt/arent
172
+ "о", # about
173
+ "из", # out of
174
+ "ему", # to him
175
+ "теперь", # now
176
+ "когда", # when
177
+ "даже", # even
178
+ "ну", # so, well
179
+ "вдруг", # suddenly
180
+ "ли", # interrogative particle
181
+ "если", # if
182
+ "уже", # already, but homonym of `narrower'
183
+ "или", # or
184
+ "ни", # neither
185
+ "быть", # to be
186
+ "был", # he was
187
+ "него", # prepositional form of его
188
+ "до", # up to
189
+ "вас", # you accusative
190
+ "нибудь", # indef. suffix preceded by hyphen
191
+ "опять", # again
192
+ "уж", # already, but homonym of `adder'
193
+ "вам", # to you
194
+ "сказал", # he said
195
+ "ведь", # particle `after all'
196
+ "там", # there
197
+ "потом", # then
198
+ "себя", # oneself
199
+ "ничего", # nothing
200
+ "ей", # to her
201
+ "может", # usually with `быть' as `maybe'
202
+ "они", # they
203
+ "тут", # here
204
+ "где", # where
205
+ "есть", # there is/are
206
+ "надо", # got to, must
207
+ "ней", # prepositional form of ей
208
+ "для", # for
209
+ "мы", # we
210
+ "тебя", # thee
211
+ "их", # them, their
212
+ "чем", # than
213
+ "была", # she was
214
+ "сам", # self
215
+ "чтоб", # in order to
216
+ "без", # without
217
+ "будто", # as if
218
+ "человек", # man, person, one
219
+ "чего", # genitive form of `what'
220
+ "раз", # once
221
+ "тоже", # also
222
+ "себе", # to oneself
223
+ "под", # beneath
224
+ "жизнь", # life
225
+ "будет", # will be
226
+ "ж", # short form of intensifer particle `же'
227
+ "тогда", # then
228
+ "кто", # who
229
+ "этот", # this
230
+ "говорил", # was saying
231
+ "того", # genitive form of `that'
232
+ "потому", # for that reason
233
+ "этого", # genitive form of `this'
234
+ "какой", # which
235
+ "совсем", # altogether
236
+ "ним", # prepositional form of `его', `они'
237
+ "здесь", # here
238
+ "этом", # prepositional form of `этот'
239
+ "один", # one
240
+ "почти", # almost
241
+ "мой", # my
242
+ "тем", # instrumental/dative plural of `тот', `то'
243
+ "чтобы", # full form of `in order that'
244
+ "нее", # her (acc.)
245
+ "кажется", # it seems
246
+ "сейчас", # now
247
+ "были", # they were
248
+ "куда", # where to
249
+ "зачем", # why
250
+ "сказать", # to say
251
+ "всех", # all (acc., gen. preposn. plural)
252
+ "никогда", # never
253
+ "сегодня", # today
254
+ "можно", # possible, one can
255
+ "при", # by
256
+ "наконец", # finally
257
+ "два", # two
258
+ "об", # alternative form of `о', about
259
+ "другой", # another
260
+ "хоть", # even
261
+ "после", # after
262
+ "над", # above
263
+ "больше", # more
264
+ "тот", # that one (masc.)
265
+ "через", # across, in
266
+ "эти", # these
267
+ "нас", # us
268
+ "про", # about
269
+ "всего", # in all, only, of all
270
+ "них", # prepositional form of `они' (they)
271
+ "какая", # which, feminine
272
+ "много", # lots
273
+ "разве", # interrogative particle
274
+ "сказала", # she said
275
+ "три", # three
276
+ "эту", # this, acc. fem. sing.
277
+ "моя", # my, feminine
278
+ "впрочем", # moreover, besides
279
+ "хорошо", # good
280
+ "свою", # ones own, acc. fem. sing.
281
+ "этой", # oblique form of `эта', fem. `this'
282
+ "перед", # in front of
283
+ "иногда", # sometimes
284
+ "лучше", # better
285
+ "чуть", # a little
286
+ "том", # preposn. form of `that one'
287
+ "нельзя", # one must not
288
+ "такой", # such a one
289
+ "им", # to them
290
+ "более", # more
291
+ "всегда", # always
292
+ "конечно", # of course
293
+ "всю", # acc. fem. sing of `all'
294
+ "между", # between
295
+ ]
296
+
297
+ SKIP_WORDS = {
298
+ 'en' => EN_CORPUS_SKIP_WORDS,
299
+ 'ru' => RU_CORPUS_SKIP_WORDS
300
+ }
301
+
302
+ end
303
+ end
@@ -0,0 +1,134 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ class Bayes < Classifier::Base
8
+
9
+ # The class can be created with one or more categories, each of which will be
10
+ # initialized and given a training method. E.g.,
11
+ # b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
12
+ # you can specify language and encoding parameters for stemmer
13
+ # (default values - :language => 'en', :encoding => 'UTF_8')
14
+ # b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
15
+ def initialize(options = {})
16
+ @categories = Hash.new
17
+ options.reverse_merge!(:categories => [])
18
+ options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
19
+ @total_words = 0
20
+ super
21
+ end
22
+
23
+ #
24
+ # Provides a general training method for all categories specified in Bayes#new
25
+ # For example:
26
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
27
+ # b.train :this, "This text"
28
+ # b.train "that", "That text"
29
+ # b.train "The other", "The other text"
30
+ def train(category, text)
31
+ category = prepare_category_name(category)
32
+ word_hash(text).each do |word, count|
33
+ @categories[category][word] ||= 0
34
+ @categories[category][word] += count
35
+ @total_words += count
36
+ end
37
+ end
38
+
39
    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      category = prepare_category_name(category)
      word_hash(text).each do |word, count|
        # Guard so the global count never goes negative.
        if @total_words >= 0
          # Remember the pre-untrain count for this word (nil if untrained).
          orig = @categories[category][word]
          @categories[category][word] ||= 0
          @categories[category][word] -= count
          if @categories[category][word] <= 0
            @categories[category].delete(word)
            # Only subtract what was actually trained, not the full +count+.
            # NOTE(review): if the word was never trained, +orig+ is nil here
            # and `@total_words -= nil` below raises — suspected latent bug;
            # confirm callers only untrain previously trained text.
            count = orig
          end
          @total_words -= count
        end
      end
    end
62
+
63
+ #
64
+ # Returns the scores in each category the provided +text+. E.g.,
65
+ # b.classifications "I hate bad words and you"
66
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
67
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
68
+ def classifications(text)
69
+ score = Hash.new
70
+ @categories.each do |category, category_words|
71
+ score[category.to_s] = 0
72
+ total = category_words.values.sum
73
+ word_hash(text).each do |word, count|
74
+ s = category_words.has_key?(word) ? category_words[word] : 0.1
75
+ score[category.to_s] += Math.log(s/total.to_f)
76
+ end
77
+ end
78
+ return score
79
+ end
80
+
81
+ #
82
+ # Returns the classification of the provided +text+, which is one of the
83
+ # categories given in the initializer. E.g.,
84
+ # b.classify "I hate bad words and you"
85
+ # => 'Uninteresting'
86
+ def classify(text)
87
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
88
+ end
89
+
90
+ #
91
+ # Provides training and untraining methods for the categories specified in Bayes#new
92
+ # For example:
93
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
94
+ # b.train_this "This text"
95
+ # b.train_that "That text"
96
+ # b.untrain_that "That text"
97
+ # b.train_the_other "The other text"
98
+ def method_missing(name, *args)
99
+ category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
100
+ if @categories.has_key? category
101
+ args.each { |text| eval("#{$1}train(category, text)") }
102
+ elsif name.to_s =~ /(un)?train_([\w]+)/
103
+ raise StandardError, "No such category: #{category}"
104
+ else
105
+ super #raise StandardError, "No such method: #{name}"
106
+ end
107
+ end
108
+
109
+ #
110
+ # Provides a list of category names
111
+ # For example:
112
+ # b.categories
113
+ # => ['This', 'That', 'the_other']
114
+ def categories # :nodoc:
115
+ @categories.keys.collect {|c| c.to_s}
116
+ end
117
+
118
+ #
119
+ # Allows you to add categories to the classifier.
120
+ # For example:
121
+ # b.add_category "Not spam"
122
+ #
123
+ # WARNING: Adding categories to a trained classifier will
124
+ # result in an undertrained category that will tend to match
125
+ # more criteria than the trained selective categories. In short,
126
+ # try to initialize your categories at initialization.
127
+ def add_category(category)
128
+ @categories[prepare_category_name(category)] = Hash.new
129
+ end
130
+
131
+ alias append_category add_category
132
+ end
133
+
134
+ end
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Vector
  # Euclidean (L2) length of the vector.
  def magnitude
    sum_of_squares = to_a.reduce(0.0) { |acc, component| acc + component ** 2.0 }
    Math.sqrt(sum_of_squares)
  end

  # Returns a new Vector scaled to unit length.
  def normalize
    len = magnitude
    scaled = to_a.map { |component| component / len }
    Vector[*scaled]
  end
end
28
+
29
class Matrix
  # Shorthand for Matrix.diagonal taking the diagonal entries as one array.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # All-Ruby singular value decomposition (fallback when GSL is unavailable).
  #
  # Diagonalizes q = A'A (tall input) or AA' (wide input) with repeated
  # Jacobi-style plane rotations, accumulating the rotations in +v+.
  # Sweeping stops once the diagonal drifts by <= 0.001 in total between
  # sweeps, or after +maxSweeps+ sweeps.
  #
  # Returns [mu, v, s]: left singular vectors, right singular vectors, and
  # the Array of singular values.
  def SV_decomp(maxSweeps = 20)
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    azrot = nil # NOTE(review): assigned but never used.
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Rotation angle chosen to zero out the off-diagonal qrot[row,col].
          # NOTE(review): divides by qrot[row,row]-qrot[col,col]; equal
          # diagonal entries make this a division by zero — confirm inputs
          # avoid that case (floats yield Infinity, exact numbers raise).
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          # Similarity transform keeps eigenvalues; accumulate rotations in v.
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        # Total movement of the diagonal since the previous sweep, ignoring
        # per-entry changes below the 0.001 tolerance.
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end # of do while true
    # Singular values are the square roots of the diagonal eigenvalues.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end
    #puts "cnt = #{cnt}"
    if self.row_size >= self.column_size
      mu = self * v * Matrix.diagonal(*s).inverse
      return [mu, v, s]
    else
      # NOTE(review): debug output left in — prints dimensions to stdout.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size

      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end

  # Element setter: stdlib Matrix exposes no public writer, so this pokes
  # the internal @rows storage directly (fragile across Ruby versions).
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
1
module GSL

  # Marshal support for GSL vectors: serialize through a plain Ruby Array.
  # NOTE(review): assumes the gsl gem is loaded so GSL::Vector#to_a and
  # GSL::Vector.alloc exist — confirm load order.
  class Vector
    # Custom Marshal hook; +v+ is the recursion depth Marshal passes to
    # _dump and is unused here.
    def _dump(v)
      Marshal.dump( self.to_a )
    end

    # Counterpart to _dump: rebuild a GSL::Vector from the dumped Array.
    def self._load(arr)
      arry = Marshal.load(arr)
      return GSL::Vector.alloc(arry)
    end

  end

  class Matrix
    # Mirror the stdlib Matrix.diag shorthand on GSL's Matrix class.
    # NOTE(review): requires GSL::Matrix.diagonal to already be defined.
    class <<self
      alias :diag :diagonal
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash
    # word_hash  - term => frequency Hash for this document.
    # categories - zero or more category labels for this node.
    def initialize( word_hash, *categories )
      # NOTE(review): +categories+ comes from a splat so it is always an
      # Array; the `|| []` fallback can never trigger.
      @categories = categories || []
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector
    # (LSI-transformed when available, raw otherwise).
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space. Builds a GSL::Vector when the $GSL
    # global flag is set, otherwise an Array wrapped in stdlib Vector.
    # Sets @raw_vector and @raw_norm as a side effect.
    def raw_vector_with( word_list )
      if $GSL
        vec = GSL::Vector.alloc(word_list.size)
      else
        vec = Array.new(word_list.size, 0)
      end

      # Copy raw term frequencies into the dimensions word_list knows about;
      # unknown words are silently dropped.
      @word_hash.each_key do |word|
        vec[word_list[word]] = @word_hash[word] if word_list[word]
      end

      # Perform the scaling transform
      # NOTE(review): Array#sum needs Ruby >= 2.4 (or ActiveSupport) in the
      # non-GSL branch — confirm the supported Ruby versions.
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        # weighted_total accumulates the (negative) entropy of the term
        # distribution; it scales the log-counts below.
        weighted_total = 0.0
        vec.each do |term|
          if ( term > 0 )
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
class String
  # Returns a short summary of the string: the +count+ most relevant
  # sentences, stripped and joined by +separator+, as ranked by an LSI index.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Like #summary but ranks whole paragraphs instead of sentences.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on ., ! and ?. The capture group means the punctuation marks
  # come back as separate elements interleaved with the sentences.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines (\n\n, \r\r or \r\n\r\n); separators are kept as
  # elements because of the capture group.
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the non-trivial chunks with Classifier::LSI, picks the +count+
  # highest-ranked chunks, strips them and joins them with +separator+.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    # Skip empty chunks and one-word chunks (no useful signal for LSI).
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # NOTE(review): rejecting members of +summaries+ that are not in
    # +summaries+ is a no-op; this looks like it was meant to filter +chunks+
    # (restoring document order) — confirm the original intent before fixing.
    return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # Keeps a word => index mapping, used to translate stemmed words into
  # dimensions of a vector space.

  class WordList
    def initialize
      @location_table = Hash.new
    end

    # Registers +word+ (if new) and assigns it the next free dimension.
    def add_word(word)
      return if @location_table[word]
      @location_table[word] = @location_table.size
    end

    # Dimension index of +lookup+, or nil when the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word occupying dimension +ind+, or nil.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Number of words mapped so far.
    def size
      @location_table.size
    end

  end
end