noctivityinc-classifier191 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,306 @@
1
+ # coding:utf-8
2
+ # $KCODE = 'utf8'
3
+
4
module Classifier
  # Shared plumbing for the classifiers: tokenization, stemming and
  # stop-word filtering. Subclasses (e.g. Bayes) call #word_hash to turn
  # raw text into a frequency hash keyed by stemmed word symbols.
  class Base

    # options - stemmer settings forwarded to Lingua::Stemmer:
    #   :language (default 'en') and :encoding (default 'UTF_8').
    # NOTE: Hash#reverse_merge! comes from ActiveSupport.
    def initialize(options = {})
      options.reverse_merge!(:language => 'en')
      options.reverse_merge!(:encoding => 'UTF_8')

      @options = options
    end

    # Normalizes a category label: underscores become spaces, the result is
    # capitalized and interned. E.g. :good_stuff => :"Good stuff"
    def prepare_category_name val
      val.to_s.gsub("_"," ").capitalize.intern
    end

    # Removes common punctuation symbols, returning a new string.
    # E.g.,
    #   "Hello (greeting's), with {braces} < >...?".without_punctuation
    #   => "Hello greetings with braces "
    def without_punctuation str
      str.tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ").tr("'\-", "")
    end

    # Return a Hash of strings => ints. Each word in the string is stemmed,
    # interned, and indexes to its frequency in the document. Runs of
    # non-word symbols are tokenized and counted as well.
    def word_hash str
      word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split)
    end

    # Return a word hash without extra punctuation or short symbols, just stemmed words
    def clean_word_hash str
      word_hash_for_words str.gsub(/[^\w\s]/,"").split
    end

    private

    # Builds the frequency hash for a token list. Word tokens are downcased
    # (multibyte-aware via ActiveSupport's String#mb_chars), stemmed and
    # interned. Stop words and words of <= 2 characters are dropped; tokens
    # containing non-word characters are always counted.
    def word_hash_for_words(words)
      d = Hash.new
      skip_words = SKIP_WORDS[@options[:language]] || []
      words.each do |word|
        word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
        key = stemmer.stem(word).intern
        if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
          d[key] ||= 0
          d[key] += 1
        end
      end
      return d
    end

    # Reuse one stemmer per classifier instead of allocating a new
    # Lingua::Stemmer on every #word_hash call.
    def stemmer
      @stemmer ||= Lingua::Stemmer.new(@options)
    end

    # English stop words skipped during training/classification.
    EN_CORPUS_SKIP_WORDS = %w[
      a again all along are also an and as at but by came can cant couldnt
      did didn didnt do doesnt dont ever first from have her here him how
      i if in into is isnt it itll just last least like most my new no not
      now of on or should sinc so some th than this that the their then
      those to told too true try until url us were when whether while with
      within yes you youll
    ].freeze

    # Russian stop words.
    # http://snowball.tartarus.org/algorithms/russian/stop.txt
    RU_CORPUS_SKIP_WORDS = %w[
      и в во не что он на я с со как а то все она так его но да ты к у же
      вы за бы по только ее мне было вот от меня еще нет о из ему теперь
      когда даже ну вдруг ли если уже или ни быть был него до вас нибудь
      опять уж вам сказал ведь там потом себя ничего ей может они тут где
      есть надо ней для мы тебя их чем была сам чтоб без будто человек
      чего раз тоже себе под жизнь будет ж тогда кто этот говорил того
      потому этого какой совсем ним здесь этом один почти мой тем чтобы
      нее кажется сейчас были куда зачем сказать всех никогда сегодня
      можно при наконец два об другой хоть после над больше тот через эти
      нас про всего них какая много разве сказала три эту моя впрочем
      хорошо свою этой перед иногда лучше чуть том нельзя такой им более
      всегда конечно всю между
    ].freeze

    # Maps the :language option to its stop-word list.
    SKIP_WORDS = {
      'en' => EN_CORPUS_SKIP_WORDS,
      'ru' => RU_CORPUS_SKIP_WORDS
    }.freeze

  end
end
@@ -0,0 +1,134 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Naive-Bayes text classifier built on Classifier::Base's word hashing.
  class Bayes < Classifier::Base

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
    # you can specify language and encoding parameters for stemmer
    # (default values - :language => 'en', :encoding => 'UTF_8')
    #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
    def initialize(options = {})
      @categories = Hash.new
      options.reverse_merge!(:categories => [])
      options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
      @total_words = 0
      super
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #   b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #   b.train :this, "This text"
    #   b.train "that", "That text"
    #   b.train "The other", "The other text"
    def train(category, text)
      category = prepare_category_name(category)
      word_hash(text).each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #   b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #   b.train :this, "This text"
    #   b.untrain :this, "This text"
    def untrain(category, text)
      category = prepare_category_name(category)
      word_hash(text).each do |word, count|
        # NOTE(review): @total_words never goes negative here, so this guard
        # is effectively always true; kept for backward compatibility.
        if @total_words >= 0
          orig = @categories[category][word]
          @categories[category][word] ||= 0
          @categories[category][word] -= count
          if @categories[category][word] <= 0
            @categories[category].delete(word)
            count = orig
          end
          @total_words -= count
        end
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #   b.classifications "I hate bad words and you"
    #   =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = Hash.new
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.values.sum
        word_hash(text).each do |word, count|
          # 0.1 is a smoothing value for words never seen in this category.
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[category.to_s] += Math.log(s/total.to_f)
        end
      end
      return score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #   b.classify "I hate bad words and you"
    #   =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #   b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #   b.train_this "This text"
    #   b.train_that "That text"
    #   b.untrain_that "That text"
    #   b.train_the_other "The other text"
    #
    # BUGFIX: the old implementation built the call with eval and relied on $1
    # set by an earlier, unanchored gsub — stale regexp state could flip
    # train/untrain. Dispatch explicitly with #send instead.
    def method_missing(name, *args)
      if name.to_s =~ /\A(un)?train_([\w]+)\z/
        action = $1 ? :untrain : :train
        category = prepare_category_name($2)
        raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)
        args.each { |text| send(action, category, text) }
      else
        super
      end
    end

    # Keep respond_to? consistent with the dynamic train_*/untrain_* methods.
    def respond_to_missing?(name, include_private = false)
      !!(name.to_s =~ /\A(un)?train_([\w]+)\z/) || super
    end

    #
    # Provides a list of category names
    # For example:
    #   b.categories
    #   =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.collect {|c| c.to_s}
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #   b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[prepare_category_name(category)] = Hash.new
    end

    alias append_category add_category
  end

end
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
# Extensions to the standard-library Vector used by the all-Ruby SVD.
class Vector
  # Euclidean length of the vector.
  def magnitude
    squared = (0...size).reduce(0.0) { |acc, idx| acc + self[idx] ** 2.0 }
    Math.sqrt(squared)
  end

  # Returns a new Vector scaled to unit length.
  def normalize
    len = magnitude
    Vector[*(0...size).map { |idx| self[idx] / len }]
  end
end
28
+
29
# Extensions to the standard-library Matrix used by the all-Ruby SVD.
class Matrix
  # Builds a diagonal matrix from an array of values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  # Short alias used throughout the SVD code.
  alias :trans :transpose

  # All-Ruby singular value decomposition using cyclic Jacobi rotations on
  # the symmetric matrix A'A (or AA' when the matrix is wider than tall).
  # Returns [mu, v, s]: the left singular vectors, right singular vectors,
  # and the singular values as a plain Array.
  #
  # maxSweeps - hard cap on rotation sweeps if convergence (diagonal change
  #             <= 0.001) is not reached first.
  #
  # NOTE(review): the rotation angle divides by qrot[row,row]-qrot[col,col];
  # if two diagonal entries are equal this is a division by zero — presumably
  # never hit on the matrices LSI feeds in, but unverified.
  def SV_decomp(maxSweeps = 20)
    # Work on the smaller of A'A / AA' so q is square and symmetric.
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    azrot = nil  # NOTE(review): assigned but never used
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      # One full sweep: rotate every off-diagonal pair (row, col).
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Jacobi rotation angle that zeroes qrot[row,col].
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          # Apply the similarity transform and accumulate eigenvectors in v.
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      s_old = qrot.dup if cnt == 1
      # Convergence test: total absolute change of the diagonal since the
      # previous sweep (changes below 0.001 are ignored).
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end # of do while true
    # Singular values are the square roots of the eigenvalues of A'A.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end
    #puts "cnt = #{cnt}"
    if self.row_size >= self.column_size
      # Recover left singular vectors: mu = A * v * S^-1.
      mu = self * v * Matrix.diagonal(*s).inverse
      return [mu, v, s]
    else
      # NOTE(review): leftover debug output in the wide-matrix branch.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size

      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end

  # Element assignment. The stdlib Matrix is designed to be immutable; this
  # pokes into @rows directly, so use with care (shared rows will alias).
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
1
# Marshal support and naming shims for the GSL-backed vector/matrix types.
module GSL

  class Vector
    # Custom Marshal hook: serialize as a plain Ruby array.
    # (+_depth+ is part of the Marshal protocol and is ignored.)
    def _dump(_depth)
      Marshal.dump(to_a)
    end

    # Rebuild a GSL::Vector from the array produced by _dump.
    # NOTE: Marshal.load must only be fed trusted data.
    def self._load(data)
      GSL::Vector.alloc(Marshal.load(data))
    end
  end

  class Matrix
    class << self
      # Match the stdlib Matrix extension's Matrix.diag spelling.
      alias_method :diag, :diagonal
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # word_hash  - frequency hash (word => count) for this document
    # categories - zero or more category labels
    def initialize(word_hash, *categories)
      @categories = categories || []
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space, then applies the entropy-weighted
    # scaling transform. Sets @raw_vector and @raw_norm.
    def raw_vector_with(word_list)
      vec = $GSL ? GSL::Vector.alloc(word_list.size) : Array.new(word_list.size, 0)

      @word_hash.each_key do |word|
        slot = word_list[word]
        vec[slot] = @word_hash[word] if slot
      end

      # Perform the scaling transform
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          next unless term > 0
          share = term / total_words
          weighted_total += share * Math.log(share)
        end
        weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
        vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
# LSI-based summarization helpers mixed into String.
class String
  # Returns a summary built from the +count+ highest-content sentences,
  # joined with +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Same as #summary, but selects whole paragraphs instead of sentences.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence-ending punctuation. NOTE: because of the capture
  # group, the punctuation marks come back as separate array elements.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the chunks with LSI and joins the +count+ highest-content
  # chunks with +separator+, preserving their original document order.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # BUGFIX: the old code filtered +summaries+ against itself (a no-op);
    # filter the original +chunks+ so selected text keeps document order.
    return chunks.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.
  class WordList
    def initialize
      @location_table = Hash.new
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    def add_word(word)
      return if @location_table[word]
      @location_table[word] = @location_table.size
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word occupying dimension +ind+, or nil.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end

  end
end