francois-classifier 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,306 @@
1
+ # coding:utf-8
2
+ $KCODE = 'utf8'
3
+
4
+ module Classifier
5
+ class Base
6
+
7
+ def initialize(options = {})
8
+ options.reverse_merge!(:language => 'en')
9
+ options.reverse_merge!(:encoding => 'UTF_8')
10
+
11
+ @options = options
12
+ end
13
+
14
+ def prepare_category_name val
15
+ val.to_s.gsub("_"," ").capitalize.intern
16
+ end
17
+
18
+ # Removes common punctuation symbols, returning a new string.
19
+ # E.g.,
20
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
21
+ # => "Hello greetings with braces "
22
+ def without_punctuation str
23
+ str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
24
+ end
25
+
26
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
27
+ # interned, and mapped to its frequency in the document.
28
+ def word_hash str
29
+ word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split)
30
+ end
31
+
32
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
33
+ def clean_word_hash str
34
+ word_hash_for_words str.gsub(/[^\w\s]/,"").split
35
+ end
36
+
37
+ private
38
+
39
+ def word_hash_for_words(words)
40
+ stemmer = Lingua::Stemmer.new(@options)
41
+ d = Hash.new
42
+ skip_words = SKIP_WORDS[@options[:language]] || []
43
+ words.each do |word|
44
+ word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
45
+ key = stemmer.stem(word).intern
46
+ if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
47
+ d[key] ||= 0
48
+ d[key] += 1
49
+ end
50
+ end
51
+ return d
52
+ end
53
+
54
+ EN_CORPUS_SKIP_WORDS = [
55
+ "a",
56
+ "again",
57
+ "all",
58
+ "along",
59
+ "are",
60
+ "also",
61
+ "an",
62
+ "and",
63
+ "as",
64
+ "at",
65
+ "but",
66
+ "by",
67
+ "came",
68
+ "can",
69
+ "cant",
70
+ "couldnt",
71
+ "did",
72
+ "didn",
73
+ "didnt",
74
+ "do",
75
+ "doesnt",
76
+ "dont",
77
+ "ever",
78
+ "first",
79
+ "from",
80
+ "have",
81
+ "her",
82
+ "here",
83
+ "him",
84
+ "how",
85
+ "i",
86
+ "if",
87
+ "in",
88
+ "into",
89
+ "is",
90
+ "isnt",
91
+ "it",
92
+ "itll",
93
+ "just",
94
+ "last",
95
+ "least",
96
+ "like",
97
+ "most",
98
+ "my",
99
+ "new",
100
+ "no",
101
+ "not",
102
+ "now",
103
+ "of",
104
+ "on",
105
+ "or",
106
+ "should",
107
+ "sinc",
108
+ "so",
109
+ "some",
110
+ "th",
111
+ "than",
112
+ "this",
113
+ "that",
114
+ "the",
115
+ "their",
116
+ "then",
117
+ "those",
118
+ "to",
119
+ "told",
120
+ "too",
121
+ "true",
122
+ "try",
123
+ "until",
124
+ "url",
125
+ "us",
126
+ "were",
127
+ "when",
128
+ "whether",
129
+ "while",
130
+ "with",
131
+ "within",
132
+ "yes",
133
+ "you",
134
+ "youll",
135
+ ]
136
+
137
+ # http://snowball.tartarus.org/algorithms/russian/stop.txt
138
+ RU_CORPUS_SKIP_WORDS = [
139
+ "и", # and
140
+ "в", # in/into
141
+ "во", # alternative form
142
+ "не", # not
143
+ "что", # what/that
144
+ "он", # he
145
+ "на", # on/onto
146
+ "я", # i
147
+ "с", # from
148
+ "со", # alternative form
149
+ "как", # how
150
+ "а", # milder form of `no' (but)
151
+ "то", # conjunction and form of `that'
152
+ "все", # all
153
+ "она", # she
154
+ "так", # so, thus
155
+ "его", # him
156
+ "но", # but
157
+ "да", # yes/and
158
+ "ты", # thou
159
+ "к", # towards, by
160
+ "у", # around, chez
161
+ "же", # intensifier particle
162
+ "вы", # you
163
+ "за", # beyond, behind
164
+ "бы", # conditional/subj. particle
165
+ "по", # up to, along
166
+ "только", # only
167
+ "ее", # her
168
+ "мне", # to me
169
+ "было", # it was
170
+ "вот", # here is/are, particle
171
+ "от", # away from
172
+ "меня", # me
173
+ "еще", # still, yet, more
174
+ "нет", # no, there isnt/arent
175
+ "о", # about
176
+ "из", # out of
177
+ "ему", # to him
178
+ "теперь", # now
179
+ "когда", # when
180
+ "даже", # even
181
+ "ну", # so, well
182
+ "вдруг", # suddenly
183
+ "ли", # interrogative particle
184
+ "если", # if
185
+ "уже", # already, but homonym of `narrower'
186
+ "или", # or
187
+ "ни", # neither
188
+ "быть", # to be
189
+ "был", # he was
190
+ "него", # prepositional form of его
191
+ "до", # up to
192
+ "вас", # you accusative
193
+ "нибудь", # indef. suffix preceded by hyphen
194
+ "опять", # again
195
+ "уж", # already, but homonym of `adder'
196
+ "вам", # to you
197
+ "сказал", # he said
198
+ "ведь", # particle `after all'
199
+ "там", # there
200
+ "потом", # then
201
+ "себя", # oneself
202
+ "ничего", # nothing
203
+ "ей", # to her
204
+ "может", # usually with `быть' as `maybe'
205
+ "они", # they
206
+ "тут", # here
207
+ "где", # where
208
+ "есть", # there is/are
209
+ "надо", # got to, must
210
+ "ней", # prepositional form of ей
211
+ "для", # for
212
+ "мы", # we
213
+ "тебя", # thee
214
+ "их", # them, their
215
+ "чем", # than
216
+ "была", # she was
217
+ "сам", # self
218
+ "чтоб", # in order to
219
+ "без", # without
220
+ "будто", # as if
221
+ "человек", # man, person, one
222
+ "чего", # genitive form of `what'
223
+ "раз", # once
224
+ "тоже", # also
225
+ "себе", # to oneself
226
+ "под", # beneath
227
+ "жизнь", # life
228
+ "будет", # will be
229
+ "ж", # short form of intensifer particle `же'
230
+ "тогда", # then
231
+ "кто", # who
232
+ "этот", # this
233
+ "говорил", # was saying
234
+ "того", # genitive form of `that'
235
+ "потому", # for that reason
236
+ "этого", # genitive form of `this'
237
+ "какой", # which
238
+ "совсем", # altogether
239
+ "ним", # prepositional form of `его', `они'
240
+ "здесь", # here
241
+ "этом", # prepositional form of `этот'
242
+ "один", # one
243
+ "почти", # almost
244
+ "мой", # my
245
+ "тем", # instrumental/dative plural of `тот', `то'
246
+ "чтобы", # full form of `in order that'
247
+ "нее", # her (acc.)
248
+ "кажется", # it seems
249
+ "сейчас", # now
250
+ "были", # they were
251
+ "куда", # where to
252
+ "зачем", # why
253
+ "сказать", # to say
254
+ "всех", # all (acc., gen. preposn. plural)
255
+ "никогда", # never
256
+ "сегодня", # today
257
+ "можно", # possible, one can
258
+ "при", # by
259
+ "наконец", # finally
260
+ "два", # two
261
+ "об", # alternative form of `о', about
262
+ "другой", # another
263
+ "хоть", # even
264
+ "после", # after
265
+ "над", # above
266
+ "больше", # more
267
+ "тот", # that one (masc.)
268
+ "через", # across, in
269
+ "эти", # these
270
+ "нас", # us
271
+ "про", # about
272
+ "всего", # in all, only, of all
273
+ "них", # prepositional form of `они' (they)
274
+ "какая", # which, feminine
275
+ "много", # lots
276
+ "разве", # interrogative particle
277
+ "сказала", # she said
278
+ "три", # three
279
+ "эту", # this, acc. fem. sing.
280
+ "моя", # my, feminine
281
+ "впрочем", # moreover, besides
282
+ "хорошо", # good
283
+ "свою", # ones own, acc. fem. sing.
284
+ "этой", # oblique form of `эта', fem. `this'
285
+ "перед", # in front of
286
+ "иногда", # sometimes
287
+ "лучше", # better
288
+ "чуть", # a little
289
+ "том", # preposn. form of `that one'
290
+ "нельзя", # one must not
291
+ "такой", # such a one
292
+ "им", # to them
293
+ "более", # more
294
+ "всегда", # always
295
+ "конечно", # of course
296
+ "всю", # acc. fem. sing of `all'
297
+ "между", # between
298
+ ]
299
+
300
+ SKIP_WORDS = {
301
+ 'en' => EN_CORPUS_SKIP_WORDS,
302
+ 'ru' => RU_CORPUS_SKIP_WORDS
303
+ }
304
+
305
+ end
306
+ end
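A minimal usage sketch for the tokenizer above (illustrative only: it assumes the gem is loaded via require 'classifier' and that the activesupport and ruby-stemmer gems it depends on are installed, since the code uses reverse_merge!, mb_chars and Lingua::Stemmer; exact stems depend on the stemmer version):

    require 'rubygems'
    require 'classifier'   # entry-point name is an assumption for this sketch

    base = Classifier::Base.new(:language => 'en')

    # Punctuation is stripped, each word is downcased and stemmed, stop words
    # such as "are" and "were" are dropped, and the rest are counted.
    base.clean_word_hash("Dogs are running, dogs were running!")
    # => {:dog=>2, :run=>2}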
@@ -0,0 +1,134 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ class Bayes < Classifier::Base
8
+
9
+ # The class can be created with one or more categories, each of which will be
10
+ # initialized and given a training method. E.g.,
11
+ # b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
12
+ # You can specify language and encoding parameters for the stemmer
13
+ # (default values - :language => 'en', :encoding => 'UTF_8')
14
+ # b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
15
+ def initialize(options = {})
16
+ @categories = Hash.new
17
+ options.reverse_merge!(:categories => [])
18
+ options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
19
+ @total_words = 0
20
+ super
21
+ end
22
+
23
+ #
24
+ # Provides a general training method for all categories specified in Bayes#new
25
+ # For example:
26
+ # b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
27
+ # b.train :this, "This text"
28
+ # b.train "that", "That text"
29
+ # b.train "The other", "The other text"
30
+ def train(category, text)
31
+ category = prepare_category_name(category)
32
+ word_hash(text).each do |word, count|
33
+ @categories[category][word] ||= 0
34
+ @categories[category][word] += count
35
+ @total_words += count
36
+ end
37
+ end
38
+
39
+ #
40
+ # Provides an untraining method for all categories specified in Bayes#new
41
+ # Be very careful with this method.
42
+ #
43
+ # For example:
44
+ # b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
45
+ # b.train :this, "This text"
46
+ # b.untrain :this, "This text"
47
+ def untrain(category, text)
48
+ category = prepare_category_name(category)
49
+ word_hash(text).each do |word, count|
50
+ if @total_words >= 0
51
+ orig = @categories[category][word]
52
+ @categories[category][word] ||= 0
53
+ @categories[category][word] -= count
54
+ if @categories[category][word] <= 0
55
+ @categories[category].delete(word)
56
+ count = orig
57
+ end
58
+ @total_words -= count
59
+ end
60
+ end
61
+ end
62
+
63
+ #
64
+ # Returns the scores in each category for the provided +text+. E.g.,
65
+ # b.classifications "I hate bad words and you"
66
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
67
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
68
+ def classifications(text)
69
+ score = Hash.new
70
+ @categories.each do |category, category_words|
71
+ score[category.to_s] = 0
72
+ total = category_words.values.sum
73
+ word_hash(text).each do |word, count|
74
+ s = category_words.has_key?(word) ? category_words[word] : 0.1
75
+ score[category.to_s] += Math.log(s/total.to_f)
76
+ end
77
+ end
78
+ return score
79
+ end
80
+
81
+ #
82
+ # Returns the classification of the provided +text+, which is one of the
83
+ # categories given in the initializer. E.g.,
84
+ # b.classify "I hate bad words and you"
85
+ # => 'Uninteresting'
86
+ def classify(text)
87
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
88
+ end
89
+
90
+ #
91
+ # Provides training and untraining methods for the categories specified in Bayes#new
92
+ # For example:
93
+ # b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
94
+ # b.train_this "This text"
95
+ # b.train_that "That text"
96
+ # b.untrain_that "That text"
97
+ # b.train_the_other "The other text"
98
+ def method_missing(name, *args)
99
+ category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
100
+ if @categories.has_key? category
101
+ args.each { |text| eval("#{$1}train(category, text)") }
102
+ elsif name.to_s =~ /(un)?train_([\w]+)/
103
+ raise StandardError, "No such category: #{category}"
104
+ else
105
+ super #raise StandardError, "No such method: #{name}"
106
+ end
107
+ end
108
+
109
+ #
110
+ # Provides a list of category names
111
+ # For example:
112
+ # b.categories
113
+ # => ['This', 'That', 'the_other']
114
+ def categories # :nodoc:
115
+ @categories.keys.collect {|c| c.to_s}
116
+ end
117
+
118
+ #
119
+ # Allows you to add categories to the classifier.
120
+ # For example:
121
+ # b.add_category "Not spam"
122
+ #
123
+ # WARNING: Adding categories to a trained classifier will
124
+ # result in an undertrained category that will tend to match
125
+ # more criteria than the trained selective categories. In short,
126
+ # define all of your categories when you create the classifier.
127
+ def add_category(category)
128
+ @categories[prepare_category_name(category)] = Hash.new
129
+ end
130
+
131
+ alias append_category add_category
132
+ end
133
+
134
+ end
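A usage sketch following the doc comments above (hedged: the category names, training text and require line are placeholders, and the scores returned by classifications are raw log values, not probabilities):

    require 'rubygems'
    require 'classifier'

    b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
    b.train :interesting,   "Here are some good words. I hope you love them."
    b.train :uninteresting, "Here are some bad words, I hate you."

    b.classify "I hate bad words and you"          # => "Uninteresting"
    b.classifications "I hate bad words and you"   # => {"Uninteresting"=>..., "Interesting"=>...}

    # Dynamic trainers provided by method_missing:
    b.train_interesting   "Another pleasant sentence"
    b.untrain_interesting "Another pleasant sentence"

    # Russian stemming and stop words:
    ru = Classifier::Bayes.new :categories => ['Spam', 'Ham'], :language => 'ru'
    ru.train :spam, "вы выиграли приз"

Note that classify simply returns the highest-scoring category, so every category should be given at least some training text before classifying (see the WARNING above about adding categories late).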
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all-Ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
+ class Vector
10
+ def magnitude
11
+ sumsqs = 0.0
12
+ self.size.times do |i|
13
+ sumsqs += self[i] ** 2.0
14
+ end
15
+ Math.sqrt(sumsqs)
16
+ end
17
+ def normalize
18
+ nv = []
19
+ mag = self.magnitude
20
+ self.size.times do |i|
21
+
22
+ nv << (self[i] / mag)
23
+
24
+ end
25
+ Vector[*nv]
26
+ end
27
+ end
28
+
29
+ class Matrix
30
+ def Matrix.diag(s)
31
+ Matrix.diagonal(*s)
32
+ end
33
+
34
+ alias :trans :transpose
35
+
36
+ def SV_decomp(maxSweeps = 20)
37
+ if self.row_size >= self.column_size
38
+ q = self.trans * self
39
+ else
40
+ q = self * self.trans
41
+ end
42
+
43
+ qrot = q.dup
44
+ v = Matrix.identity(q.row_size)
45
+ azrot = nil
46
+ mzrot = nil
47
+ cnt = 0
48
+ s_old = nil
49
+ mu = nil
50
+
51
+ while true do
52
+ cnt += 1
53
+ for row in (0...qrot.row_size-1) do
54
+ for col in (1..qrot.row_size-1) do
55
+ next if row == col
56
+ h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
57
+ hcos = Math.cos(h)
58
+ hsin = Math.sin(h)
59
+ mzrot = Matrix.identity(qrot.row_size)
60
+ mzrot[row,row] = hcos
61
+ mzrot[row,col] = -hsin
62
+ mzrot[col,row] = hsin
63
+ mzrot[col,col] = hcos
64
+ qrot = mzrot.trans * qrot * mzrot
65
+ v = v * mzrot
66
+ end
67
+ end
68
+ s_old = qrot.dup if cnt == 1
69
+ sum_qrot = 0.0
70
+ if cnt > 1
71
+ qrot.row_size.times do |r|
72
+ sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
73
+ end
74
+ s_old = qrot.dup
75
+ end
76
+ break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
77
+ end # of do while true
78
+ s = []
79
+ qrot.row_size.times do |r|
80
+ s << Math.sqrt(qrot[r,r])
81
+ end
82
+ #puts "cnt = #{cnt}"
83
+ if self.row_size >= self.column_size
84
+ mu = self * v * Matrix.diagonal(*s).inverse
85
+ return [mu, v, s]
86
+ else
87
+ puts v.row_size
88
+ puts v.column_size
89
+ puts self.row_size
90
+ puts self.column_size
91
+ puts s.size
92
+
93
+ mu = (self.trans * v * Matrix.diagonal(*s).inverse)
94
+ return [mu, v, s]
95
+ end
96
+ end
97
+ def []=(i,j,val)
98
+ @rows[i][j] = val
99
+ end
100
+ end
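For orientation, a tiny sketch of the pure-Ruby SVD above (assumes the Ruby 1.8-era matrix/mathn behaviour this file targets; results are approximate because the Jacobi sweep stops at a 0.001 tolerance):

    require 'matrix'
    require 'mathn'
    # with the extensions in this file loaded

    m = Matrix[[2.0, 0.0], [0.0, 1.0], [0.0, 0.0]]   # 3x2, so row_size >= column_size
    u, v, s = m.SV_decomp

    s                               # => [2.0, 1.0]  (singular values)
    u * Matrix.diag(s) * v.trans    # ~= m, up to sign and tolerance

Since mathn has been deprecated and later dropped from the standard library, this file is tied to old Ruby versions.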
@@ -0,0 +1,20 @@
1
+ module GSL
2
+
3
+ class Vector
4
+ def _dump(v)
5
+ Marshal.dump( self.to_a )
6
+ end
7
+
8
+ def self._load(arr)
9
+ arry = Marshal.load(arr)
10
+ return GSL::Vector.alloc(arry)
11
+ end
12
+
13
+ end
14
+
15
+ class Matrix
16
+ class <<self
17
+ alias :diag :diagonal
18
+ end
19
+ end
20
+ end
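A short sketch of what this GSL patch enables, assuming the rb-gsl bindings are installed (GSL::Vector.alloc and the diagonal method aliased above come from those bindings):

    require 'gsl'

    v = GSL::Vector.alloc(1.0, 2.0, 3.0)
    restored = Marshal.load(Marshal.dump(v))   # round-trips through _dump/_load above
    restored.to_a                              # => [1.0, 2.0, 3.0]

The marshalling support is presumably there so that GSL-backed vectors can be dumped with Marshal just like the pure-Ruby ones.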
@@ -0,0 +1,73 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+
7
+ # This is an internal data structure class for the LSI node. Save for
8
+ # raw_vector_with, it should be fairly straightforward to understand.
9
+ # You should never have to use it directly.
10
+ class ContentNode
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
16
+ # A node is constructed from a word hash (word => frequency) and the
17
+ # categories it belongs to.
18
+ def initialize( word_hash, *categories )
19
+ @categories = categories || []
20
+ @word_hash = word_hash
21
+ end
22
+
23
+ # Use this to fetch the appropriate search vector.
24
+ def search_vector
25
+ @lsi_vector || @raw_vector
26
+ end
27
+
28
+ # Use this to fetch the appropriate search vector in normalized form.
29
+ def search_norm
30
+ @lsi_norm || @raw_norm
31
+ end
32
+
33
+ # Creates the raw vector out of word_hash using word_list as the
34
+ # key for mapping the vector space.
35
+ def raw_vector_with( word_list )
36
+ if $GSL
37
+ vec = GSL::Vector.alloc(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
41
+
42
+ @word_hash.each_key do |word|
43
+ vec[word_list[word]] = @word_hash[word] if word_list[word]
44
+ end
45
+
46
+ # Perform the scaling transform
47
+ total_words = vec.sum.to_f
48
+
49
+ # Perform first-order association transform if this vector has more
50
+ # than one word in it.
51
+ if total_words > 1.0
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
+ end
57
+ end
58
+ weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
59
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
60
+ end
61
+
62
+ if $GSL
63
+ @raw_norm = vec.normalize
64
+ @raw_vector = vec
65
+ else
66
+ @raw_norm = Vector[*vec].normalize
67
+ @raw_vector = Vector[*vec]
68
+ end
69
+ end
70
+
71
+ end
72
+
73
+ end
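A sketch of how a ContentNode is typically built and queried (hedged: this uses the pure-Ruby vector path, i.e. without GSL, and assumes the gem is loaded via require 'classifier' so that the Vector#normalize extension above is available):

    require 'rubygems'
    require 'classifier'

    word_hash = { :dog => 2, :cat => 1 }          # e.g. output of Base#clean_word_hash
    list = Classifier::WordList.new
    word_hash.each_key { |w| list.add_word(w) }

    node = Classifier::ContentNode.new(word_hash, :pets)
    node.raw_vector_with(list)    # fills @raw_vector and @raw_norm

    node.search_vector            # => log/entropy-scaled term vector
    node.search_norm              # => the same vector scaled to unit length
    node.categories               # => [:pets]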
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ class String
6
+ def summary( count=10, separator=" [...] " )
7
+ perform_lsi split_sentences, count, separator
8
+ end
9
+
10
+ def paragraph_summary( count=1, separator=" [...] " )
11
+ perform_lsi split_paragraphs, count, separator
12
+ end
13
+
14
+ def split_sentences
15
+ split /(\.|\!|\?)/ # TODO: make this less primitive
16
+ end
17
+
18
+ def split_paragraphs
19
+ split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
+ end
21
+
22
+ private
23
+
24
+ def perform_lsi(chunks, count, separator)
25
+ lsi = Classifier::LSI.new :auto_rebuild => false
26
+ chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
+ lsi.build_index
28
+ summaries = lsi.highest_relative_content count
29
+ return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
+ end
31
+ end
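The summarizer above leans on Classifier::LSI, which lives elsewhere in this gem and is not part of this diff; a hedged usage sketch (the sentences chosen depend on the LSI index, and indexing is slow without GSL):

    require 'rubygems'
    require 'classifier'

    text = "Ruby is a dynamic language. It focuses on programmer happiness. " +
           "Camels are large mammals. They live in deserts and store fat in their humps."

    text.summary(2)             # => the two most representative sentences, joined by " [...] "
    text.paragraph_summary(1)   # => the single most representative paragraph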
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module Classifier
6
+ # This class keeps a word => index mapping. It is used to map stemmed words
7
+ # to dimensions of a vector.
8
+
9
+ class WordList
10
+ def initialize
11
+ @location_table = Hash.new
12
+ end
13
+
14
+ # Adds a word (if it is new) and assigns it a unique dimension.
15
+ def add_word(word)
16
+ term = word
17
+ @location_table[term] = @location_table.size unless @location_table[term]
18
+ end
19
+
20
+ # Returns the dimension of the word or nil if the word is not in the space.
21
+ def [](lookup)
22
+ term = lookup
23
+ @location_table[term]
24
+ end
25
+
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
30
+ # Returns the number of words mapped.
31
+ def size
32
+ @location_table.size
33
+ end
34
+
35
+ end
36
+ end
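Finally, a small sketch of the WordList mapping on its own:

    require 'rubygems'
    require 'classifier'

    list = Classifier::WordList.new
    list.add_word(:dog)       # assigned dimension 0
    list.add_word(:cat)       # assigned dimension 1
    list.add_word(:dog)       # already present, mapping unchanged

    list[:cat]                # => 1
    list[:mouse]              # => nil (word not in the space)
    list.word_for_index(0)    # => :dog
    list.size                 # => 2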