esilverberg-classifier 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
# coding:utf-8

# $KCODE only exists on Ruby 1.8; assigning it on 1.9+ merely triggers a
# "variable $KCODE is no longer effective" warning, so restrict the
# assignment to interpreters where it still does something.
$KCODE = 'utf8' if RUBY_VERSION < '1.9'
module Classifier
  # Shared behaviour for the concrete classifiers: option handling and
  # tokenisation/stemming of input text into word-frequency hashes.
  class Base

    # Accepts :language (default 'en') and :encoding (default 'UTF_8'),
    # which are passed through to Lingua::Stemmer.
    # Unlike the original reverse_merge! implementation, this does not
    # mutate the caller's options hash and needs no ActiveSupport.
    def initialize(options = {})
      @options = { :language => 'en', :encoding => 'UTF_8' }.merge(options)
    end

    # Normalises a category name for use as a key:
    #   "good_stuff" / :good_stuff  =>  :"Good stuff"
    def prepare_category_name val
      val.to_s.gsub("_"," ").capitalize.intern
    end

    # Removes common punctuation symbols, returning a new string.
    # E.g.,
    #   "Hello (greeting's), with {braces} < >...?".without_punctuation
    #   => "Hello greetings with braces "
    def without_punctuation str
      str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
    end

    # Return a Hash of strings => ints. Each word in the string is stemmed,
    # interned, and indexes to its frequency in the document.
    # The second gsub blanks out word characters so runs of punctuation
    # survive as their own tokens alongside the plain words.
    def word_hash str
      word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split)
    end

    # Return a word hash without extra punctuation or short symbols, just stemmed words
    def clean_word_hash str
      word_hash_for_words str.gsub(/[^\w\s]/,"").split
    end

    private

    # Lazily constructed stemmer configured with @options.
    def stemmer
      @stemmer ||= Lingua::Stemmer.new(@options)
    end

    # Maps stemmed, interned words to their frequency. Pure-symbol tokens
    # are always counted; word tokens are skipped when they are stop words
    # for the configured language or shorter than three characters.
    def word_hash_for_words(words)
      d = Hash.new
      skip_words = SKIP_WORDS[@options[:language]] || []
      words.each do |word|
        if word =~ /[\w]+/
          # Use ActiveSupport's multibyte-aware downcase when available,
          # otherwise fall back to String#downcase (Unicode-aware on 2.4+).
          word = word.respond_to?(:mb_chars) ? word.mb_chars.downcase.to_s : word.downcase
        end
        key = stemmer.stem(word).intern
        if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
          d[key] = (d[key] || 0) + 1
        end
      end
      d
    end

    # English stop words skipped during tokenisation.
    EN_CORPUS_SKIP_WORDS = [
      "a",
      "again",
      "all",
      "along",
      "are",
      "also",
      "an",
      "and",
      "as",
      "at",
      "but",
      "by",
      "came",
      "can",
      "cant",
      "couldnt",
      "did",
      "didn",
      "didnt",
      "do",
      "doesnt",
      "dont",
      "ever",
      "first",
      "from",
      "have",
      "her",
      "here",
      "him",
      "how",
      "i",
      "if",
      "in",
      "into",
      "is",
      "isnt",
      "it",
      "itll",
      "just",
      "last",
      "least",
      "like",
      "most",
      "my",
      "new",
      "no",
      "not",
      "now",
      "of",
      "on",
      "or",
      "should",
      "sinc",
      "so",
      "some",
      "th",
      "than",
      "this",
      "that",
      "the",
      "their",
      "then",
      "those",
      "to",
      "told",
      "too",
      "true",
      "try",
      "until",
      "url",
      "us",
      "were",
      "when",
      "whether",
      "while",
      "with",
      "within",
      "yes",
      "you",
      "youll",
    ].freeze

    # Russian stop words, taken from
    # http://snowball.tartarus.org/algorithms/russian/stop.txt
    RU_CORPUS_SKIP_WORDS = [
      "и", # and
      "в", # in/into
      "во", # alternative form
      "не", # not
      "что", # what/that
      "он", # he
      "на", # on/onto
      "я", # i
      "с", # from
      "со", # alternative form
      "как", # how
      "а", # milder form of `no' (but)
      "то", # conjunction and form of `that'
      "все", # all
      "она", # she
      "так", # so, thus
      "его", # him
      "но", # but
      "да", # yes/and
      "ты", # thou
      "к", # towards, by
      "у", # around, chez
      "же", # intensifier particle
      "вы", # you
      "за", # beyond, behind
      "бы", # conditional/subj. particle
      "по", # up to, along
      "только", # only
      "ее", # her
      "мне", # to me
      "было", # it was
      "вот", # here is/are, particle
      "от", # away from
      "меня", # me
      "еще", # still, yet, more
      "нет", # no, there isnt/arent
      "о", # about
      "из", # out of
      "ему", # to him
      "теперь", # now
      "когда", # when
      "даже", # even
      "ну", # so, well
      "вдруг", # suddenly
      "ли", # interrogative particle
      "если", # if
      "уже", # already, but homonym of `narrower'
      "или", # or
      "ни", # neither
      "быть", # to be
      "был", # he was
      "него", # prepositional form of его
      "до", # up to
      "вас", # you accusative
      "нибудь", # indef. suffix preceded by hyphen
      "опять", # again
      "уж", # already, but homonym of `adder'
      "вам", # to you
      "сказал", # he said
      "ведь", # particle `after all'
      "там", # there
      "потом", # then
      "себя", # oneself
      "ничего", # nothing
      "ей", # to her
      "может", # usually with `быть' as `maybe'
      "они", # they
      "тут", # here
      "где", # where
      "есть", # there is/are
      "надо", # got to, must
      "ней", # prepositional form of ей
      "для", # for
      "мы", # we
      "тебя", # thee
      "их", # them, their
      "чем", # than
      "была", # she was
      "сам", # self
      "чтоб", # in order to
      "без", # without
      "будто", # as if
      "человек", # man, person, one
      "чего", # genitive form of `what'
      "раз", # once
      "тоже", # also
      "себе", # to oneself
      "под", # beneath
      "жизнь", # life
      "будет", # will be
      "ж", # short form of intensifer particle `же'
      "тогда", # then
      "кто", # who
      "этот", # this
      "говорил", # was saying
      "того", # genitive form of `that'
      "потому", # for that reason
      "этого", # genitive form of `this'
      "какой", # which
      "совсем", # altogether
      "ним", # prepositional form of `его', `они'
      "здесь", # here
      "этом", # prepositional form of `этот'
      "один", # one
      "почти", # almost
      "мой", # my
      "тем", # instrumental/dative plural of `тот', `то'
      "чтобы", # full form of `in order that'
      "нее", # her (acc.)
      "кажется", # it seems
      "сейчас", # now
      "были", # they were
      "куда", # where to
      "зачем", # why
      "сказать", # to say
      "всех", # all (acc., gen. preposn. plural)
      "никогда", # never
      "сегодня", # today
      "можно", # possible, one can
      "при", # by
      "наконец", # finally
      "два", # two
      "об", # alternative form of `о', about
      "другой", # another
      "хоть", # even
      "после", # after
      "над", # above
      "больше", # more
      "тот", # that one (masc.)
      "через", # across, in
      "эти", # these
      "нас", # us
      "про", # about
      "всего", # in all, only, of all
      "них", # prepositional form of `они' (they)
      "какая", # which, feminine
      "много", # lots
      "разве", # interrogative particle
      "сказала", # she said
      "три", # three
      "эту", # this, acc. fem. sing.
      "моя", # my, feminine
      "впрочем", # moreover, besides
      "хорошо", # good
      "свою", # ones own, acc. fem. sing.
      "этой", # oblique form of `эта', fem. `this'
      "перед", # in front of
      "иногда", # sometimes
      "лучше", # better
      "чуть", # a little
      "том", # preposn. form of `that one'
      "нельзя", # one must not
      "такой", # such a one
      "им", # to them
      "более", # more
      "всегда", # always
      "конечно", # of course
      "всю", # acc. fem. sing of `all'
      "между", # between
    ].freeze

    # Language code => stop-word list used by word_hash_for_words.
    SKIP_WORDS = {
      'en' => EN_CORPUS_SKIP_WORDS,
      'ru' => RU_CORPUS_SKIP_WORDS
    }.freeze

  end
end
@@ -0,0 +1,134 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
module Classifier

  # Naive Bayes text classifier: keeps per-category word counts and scores
  # new documents by summed log-probabilities.
  class Bayes < Classifier::Base

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
    # you can specify language and encoding parameters for stemmer
    # (default values - :language => 'en', :encoding => 'UTF_8')
    #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
    def initialize(options = {})
      @categories = Hash.new
      # No reverse_merge! here: the caller's hash stays untouched and
      # ActiveSupport is not required.
      (options[:categories] || []).each do |category|
        @categories[prepare_category_name(category)] = Hash.new
      end
      @total_words = 0
      super
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
    #   b.train :this, "This text"
    #   b.train "that", "That text"
    #   b.train "The other", "The other text"
    def train(category, text)
      category = prepare_category_name(category)
      word_hash(text).each do |word, count|
        @categories[category][word] = (@categories[category][word] || 0) + count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #   b.train :this, "This text"
    #   b.untrain :this, "This text"
    def untrain(category, text)
      category = prepare_category_name(category)
      word_hash(text).each do |word, count|
        next unless @total_words >= 0
        # Guard against untraining a word that was never trained: the
        # original read a nil count here and later crashed on
        # `@total_words -= nil`.
        orig = @categories[category][word] || 0
        @categories[category][word] = orig - count
        if @categories[category][word] <= 0
          @categories[category].delete(word)
          # Never subtract more occurrences than were actually stored.
          count = orig
        end
        @total_words -= count
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #   b.classifications "I hate bad words and you"
    #   => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = Hash.new
      # Tokenise/stem once instead of once per category.
      text_hash = word_hash(text)
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.values.sum.to_f
        # An untrained category used to divide by zero, producing
        # +Infinity scores that always won; treat it as one word instead.
        total = 1.0 if total.zero?
        text_hash.each do |word, count|
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[category.to_s] += Math.log(s / total)
        end
      end
      score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #   b.classify "I hate bad words and you"
    #   => 'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides dynamic training and untraining methods for the categories
    # specified in Bayes#new. For example:
    #   b.train_this "This text"
    #   b.train_that "That text"
    #   b.untrain_that "That text"
    #   b.train_the_other "The other text"
    # Replaces the original eval-based dispatch: the regex is anchored, so
    # only train_*/untrain_* calls are intercepted, and the handler is
    # invoked via send instead of building code with eval.
    def method_missing(name, *args)
      if name.to_s =~ /\A(un)?train_([\w]+)\z/
        action = $1 ? :untrain : :train
        category = prepare_category_name($2)
        raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)
        args.each { |text| send(action, category, text) }
      else
        super
      end
    end

    # Keep respond_to? consistent with the dynamic train_*/untrain_* methods.
    def respond_to_missing?(name, include_private = false)
      !!(name.to_s =~ /\A(un)?train_([\w]+)\z/) || super
    end

    #
    # Provides a list of category names
    # For example:
    #   b.categories
    #   => ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.collect {|c| c.to_s}
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #   b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[prepare_category_name(category)] = Hash.new
    end

    alias append_category add_category
  end

end
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
require 'matrix'
# 'mathn' was deprecated in Ruby 2.2 and removed from the stdlib in 2.5;
# load it only when available so these extensions work on modern Rubies.
begin
  require 'mathn'
rescue LoadError
  # mathn unavailable (Ruby >= 2.5) -- Matrix/Vector work without it.
end
+
class Vector
  # Euclidean length of the vector.
  def magnitude
    squares = (0...size).map { |idx| self[idx] ** 2.0 }
    Math.sqrt(squares.inject(0.0) { |acc, sq| acc + sq })
  end

  # A new vector of unit length pointing in the same direction.
  def normalize
    len = magnitude
    Vector[*(0...size).map { |idx| self[idx] / len }]
  end
end
28
+
class Matrix
  # Convenience constructor: diagonal matrix from an array of entries.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  # Short alias used throughout the SVD code below.
  alias :trans :transpose

  # All-Ruby singular value decomposition via cyclic Jacobi rotations
  # applied to the symmetric matrix A'A (or AA' when the matrix is wide).
  # Returns [u, v, s] where v holds the right singular vectors and s is
  # an Array of singular values. Iterates until the diagonal changes by
  # less than 0.001 between sweeps or maxSweeps is reached, so accuracy
  # is bounded by that tolerance.
  def SV_decomp(maxSweeps = 20)
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    azrot = nil # NOTE(review): assigned but never used afterwards
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Jacobi rotation angle chosen to zero out qrot[row,col].
          # NOTE(review): divides by (qrot[row,row]-qrot[col,col]);
          # equal diagonal entries would divide by zero -- confirm inputs.
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          # Similarity transform; v accumulates the rotations.
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        # Total drift of the diagonal since the previous sweep; changes
        # below 0.001 per entry are ignored.
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end # of do while true
    # Singular values are square roots of the eigenvalues of A'A.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end
    #puts "cnt = #{cnt}"
    if self.row_size >= self.column_size
      # NOTE(review): diagonal(*s).inverse fails if any singular value is
      # zero (singular input) -- confirm callers avoid that case.
      mu = self * v * Matrix.diagonal(*s).inverse
      return [mu, v, s]
    else
      # NOTE(review): these puts look like leftover debug output sent to
      # stdout on every wide-matrix decomposition.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size

      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end

  # Element assignment. Stdlib Matrix is nominally immutable; this
  # extension pokes directly into the internal @rows storage.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
# Marshal support for GSL objects so indexes built on the GSL backend can
# be serialised. Only meaningful when the GSL bindings are loaded.
module GSL

  class Vector
    # Marshal hook: dump the vector as a plain Ruby Array.
    def _dump(v)
      Marshal.dump( self.to_a )
    end

    # Marshal hook: rebuild the GSL vector from the dumped Array.
    def self._load(arr)
      arry = Marshal.load(arr)
      return GSL::Vector.alloc(arry)
    end

  end

  class Matrix
    class <<self
      # Match the Matrix.diag API of the pure-Ruby matrix extension.
      alias :diag :diagonal
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
module Classifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
      :lsi_vector, :lsi_norm,
      :categories

    attr_reader :word_hash

    # word_hash maps stemmed words to their frequency in this document;
    # any remaining arguments become the node's category labels.
    def initialize( word_hash, *categories )
      @categories = categories || []
      @word_hash = word_hash
    end

    # The LSI vector once the index is built, otherwise the raw vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Normalized counterpart of search_vector.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Builds @raw_vector/@raw_norm out of word_hash, using word_list to
    # map each word to its dimension in the vector space.
    def raw_vector_with( word_list )
      vec = $GSL ? GSL::Vector.alloc(word_list.size) : Array.new(word_list.size, 0)

      @word_hash.each do |word, freq|
        dim = word_list[word]
        vec[dim] = freq if dim
      end

      total_words = vec.sum.to_f

      # First-order association transform: scale each term by the
      # (negated) entropy of the in-document term distribution. Only
      # applied when the document contains more than one word.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          next unless term > 0
          share = term / total_words
          weighted_total += share * Math.log( share )
        end
        weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
class String
  # Returns a short summary built from the +count+ highest-ranked
  # sentences, joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Like #summary, but ranks whole paragraphs instead of sentences.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Naive sentence splitter. The capture group means the returned array
  # also contains the ".", "!" and "?" delimiters between the sentences.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Naive paragraph splitter (blank-line delimited); delimiters are
  # likewise kept in the result.
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the usable chunks with LSI and joins the +count+ most
  # representative ones. Chunks that are blank or a single word are
  # skipped (they carry no ranking signal).
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # The original filtered +summaries+ against itself -- a no-op
    # (`summaries.reject { |c| !summaries.include? c }`); just strip
    # and join the selected chunks.
    summaries.map { |x| x.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.

  class WordList
    def initialize
      @location_table = Hash.new
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    def add_word(word)
      return if @location_table.key?(word)
      @location_table[word] = @location_table.size
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word assigned to dimension +ind+, or nil.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end

  end
end