noctivityinc-classifier191 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +429 -0
- data/Manifest +19 -0
- data/README +86 -0
- data/Rakefile +15 -0
- data/classifier.gemspec +38 -0
- data/lib/classifier/base.rb +306 -0
- data/lib/classifier/bayes.rb +134 -0
- data/lib/classifier/extensions/vector.rb +100 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/lsi/content_node.rb +73 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/lib/classifier/lsi.rb +337 -0
- data/lib/classifier.rb +32 -0
- data/lib/init.rb +1 -0
- data/test/base_test.rb +17 -0
- data/test/bayes/bayesian_test.rb +52 -0
- data/test/lsi/lsi_test.rb +167 -0
- data/test/test_helper.rb +4 -0
- metadata +111 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
# coding:utf-8
|
2
|
+
# $KCODE = 'utf8'
|
3
|
+
|
4
|
+
module Classifier
|
5
|
+
class Base
|
6
|
+
|
7
|
+
# Stores stemmer/classifier options, filling in defaults.
#
# options - Hash; recognized keys include :language (default 'en') and
#           :encoding (default 'UTF_8'), both consumed by the stemmer.
#
# The original used ActiveSupport's Hash#reverse_merge!; this is the
# stdlib equivalent: a default is written only when the caller did not
# pass the key at all (even an explicit nil value is preserved), and the
# caller's hash is mutated, exactly as before.
def initialize(options = {})
  options[:language] = 'en' unless options.key?(:language)
  options[:encoding] = 'UTF_8' unless options.key?(:encoding)

  @options = options
end
|
13
|
+
|
14
|
+
# Normalizes a category label into its canonical symbol form:
# underscores become spaces and the result is capitalized, e.g.
#   prepare_category_name(:the_other)  # => :"The other"
def prepare_category_name val
  val.to_s.tr("_", " ").capitalize.intern
end
|
17
|
+
|
18
|
+
# Removes common punctuation symbols, returning a new string.
# Listed punctuation is replaced by spaces, while apostrophes and
# hyphens are dropped entirely (so contractions collapse: "it's" -> "its").
# E.g.,
#   "Hello (greeting's), with {braces} < >...?".without_punctuation
#   => "Hello greetings with braces "
def without_punctuation str
  spaced = str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " )
  spaced.delete( "'\-" )
end
|
25
|
+
|
26
|
+
# Return a Hash of strings => ints. Each word in the string is stemmed,
# interned, and indexes to its frequency in the document.
# Words (strings stripped of non-word characters) and bare symbol runs
# (the string with word characters blanked out) are hashed together.
def word_hash str
  words   = str.gsub(/[^\w\s]/,"").split
  symbols = str.gsub(/[\w]/," ").split
  word_hash_for_words(words + symbols)
end
|
31
|
+
|
32
|
+
# Return a word hash without extra punctuation or short symbols, just stemmed words.
# Unlike #word_hash, the symbol runs are discarded entirely.
def clean_word_hash str
  word_hash_for_words(str.gsub(/[^\w\s]/,"").split)
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
# Builds the stemmed-word frequency hash used by #word_hash and
# #clean_word_hash.
#
# words - Array of String tokens.
#
# Returns a Hash of Symbol (stemmed, interned token) => Integer count.
def word_hash_for_words(words)
  # Lingua::Stemmer comes from the ruby-stemmer gem; it reads the
  # :language/:encoding options set in Base#initialize.
  stemmer = Lingua::Stemmer.new(@options)
  d = Hash.new
  # Fall back to no stop words for languages without a list.
  skip_words = SKIP_WORDS[@options[:language]] || []
  words.each do |word|
    # mb_chars is ActiveSupport's multibyte proxy: lowercases non-ASCII
    # (e.g. Cyrillic) correctly where plain String#downcase may not.
    word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
    key = stemmer.stem(word).intern
    # Precedence note: && binds tighter than ||, so this reads as
    #   symbol-run OR (not-a-stop-word AND longer than 2 chars).
    # i.e. pure-symbol tokens are always counted; word tokens are counted
    # only when they are not stop words and longer than two characters.
    if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
      d[key] ||= 0
      d[key] += 1
    end
  end
  return d
end
|
53
|
+
|
54
|
+
# Stop words ignored by #word_hash_for_words (word tokens found in the
# active language's list are not counted).  All three constants are
# frozen so the shared lists cannot be mutated by callers.
EN_CORPUS_SKIP_WORDS = %w[
  a again all along are also an and as at but by came can cant couldnt
  did didn didnt do doesnt dont ever first from have her here him how
  i if in into is isnt it itll just last least like most my new no not
  now of on or should sinc so some th than this that the their then
  those to told too true try until url us were when whether while with
  within yes you youll
].freeze

# Russian stop words, taken from
# http://snowball.tartarus.org/algorithms/russian/stop.txt
RU_CORPUS_SKIP_WORDS = %w[
  и в во не что он на я с со как а то все она так его но да ты
  к у же вы за бы по только ее мне было вот от меня еще нет о из ему
  теперь когда даже ну вдруг ли если уже или ни быть был него до вас
  нибудь опять уж вам сказал ведь там потом себя ничего ей может они
  тут где есть надо ней для мы тебя их чем была сам чтоб без будто
  человек чего раз тоже себе под жизнь будет ж тогда кто этот говорил
  того потому этого какой совсем ним здесь этом один почти мой тем
  чтобы нее кажется сейчас были куда зачем сказать всех никогда
  сегодня можно при наконец два об другой хоть после над больше тот
  через эти нас про всего них какая много разве сказала три эту моя
  впрочем хорошо свою этой перед иногда лучше чуть том нельзя такой
  им более всегда конечно всю между
].freeze

# Maps a :language option value to its stop-word list.
SKIP_WORDS = {
  'en' => EN_CORPUS_SKIP_WORDS,
  'ru' => RU_CORPUS_SKIP_WORDS
}.freeze
|
304
|
+
|
305
|
+
end
|
306
|
+
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier
|
6
|
+
|
7
|
+
class Bayes < Classifier::Base
|
8
|
+
|
9
|
+
# The class can be created with one or more categories, each of which will be
# initialized and given a training method. E.g.,
#      b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
# you can specify language and encoding parameters for stemmer
# (default values - :language => 'en', :encoding => 'UTF_8')
#      b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
def initialize(options = {})
  @categories = Hash.new
  # stdlib replacement for ActiveSupport's reverse_merge!: only supply
  # the default when the caller did not pass :categories at all.
  options[:categories] = [] unless options.key?(:categories)
  options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
  @total_words = 0
  super
end
|
22
|
+
|
23
|
+
#
# Provides a general training method for all categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.train "that", "That text"
#     b.train "The other", "The other text"
def train(category, text)
  key = prepare_category_name(category)
  word_hash(text).each do |word, count|
    bucket = @categories[key]
    bucket[word] = (bucket[word] || 0) + count
    @total_words += count
  end
end
|
38
|
+
|
39
|
+
#
# Provides a untraining method for all categories specified in Bayes#new
# Be very careful with this method.
#
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train :this, "This text"
#     b.untrain :this, "This text"
def untrain(category, text)
  category = prepare_category_name(category)
  word_hash(text).each do |word, count|
    if @total_words >= 0
      # Bug fix: the previous count defaults to 0 for a word that was
      # never trained; the original stored nil here, and `count = orig`
      # below then made `@total_words -= nil` raise a TypeError.
      orig = @categories[category][word] || 0
      @categories[category][word] = orig - count
      if @categories[category][word] <= 0
        # Dropping the word entirely: only subtract what was actually
        # stored, so @total_words never goes negative for this word.
        @categories[category].delete(word)
        count = orig
      end
      @total_words -= count
    end
  end
end
|
62
|
+
|
63
|
+
#
# Returns the scores in each category the provided +text+. E.g.,
#    b.classifications "I hate bad words and you"
#    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
# The largest of these scores (the one closest to 0) is the one picked out by #classify
def classifications(text)
  score = Hash.new
  # Hoisted out of the category loop: the original re-stemmed the whole
  # text once per category, which is pure repeated work.
  text_word_hash = word_hash(text)
  @categories.each do |category, category_words|
    score[category.to_s] = 0
    total = category_words.values.sum
    text_word_hash.each do |word, count|
      # Unseen words get a small pseudo-count of 0.1 to avoid log(0).
      # NOTE(review): +count+ is unused, so repeated words are not
      # weighted by frequency; and +total+ is 0 for an untrained
      # category, yielding infinite scores — both preserved as-is.
      s = category_words.has_key?(word) ? category_words[word] : 0.1
      score[category.to_s] += Math.log(s/total.to_f)
    end
  end
  return score
end
|
80
|
+
|
81
|
+
#
# Returns the classification of the provided +text+, which is one of the
# categories given in the initializer. E.g.,
#    b.classify "I hate bad words and you"
#    =>  'Uninteresting'
# Picks the category whose score (see #classifications) is largest,
# i.e. closest to zero.
def classify(text)
  classifications(text).max_by { |_category, score| score }.first
end
|
89
|
+
|
90
|
+
#
# Provides training and untraining methods for the categories specified in Bayes#new
# For example:
#     b = Classifier::Bayes.new 'This', 'That', 'the_other'
#     b.train_this "This text"
#     b.train_that "That text"
#     b.untrain_that "That text"
#     b.train_the_other "The other text"
def method_missing(name, *args)
  # The gsub both strips the (un)train_ prefix and sets $1 to "un"/nil.
  category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
  if @categories.has_key? category
    prefix = $1 # captured by the gsub above; nil for plain train_*
    # send instead of eval: same dispatch ("train"/"untrain") without
    # building and evaluating a code string.
    args.each { |text| send("#{prefix}train", category, text) }
  elsif name.to_s =~ /(un)?train_([\w]+)/
    raise StandardError, "No such category: #{category}"
  else
    super #raise StandardError, "No such method: #{name}"
  end
end

# Keep respond_to? consistent with the dynamic (un)train_* methods
# handled by method_missing above.
def respond_to_missing?(name, include_private = false)
  md = name.to_s.match(/\A(un)?train_([\w]+)\z/)
  (md && @categories.has_key?(prepare_category_name(md[2]))) || super
end
|
108
|
+
|
109
|
+
#
# Provides a list of category names
# For example:
#     b.categories
#     =>   ['This', 'That', 'the_other']
def categories # :nodoc:
  @categories.keys.map(&:to_s)
end
|
117
|
+
|
118
|
+
#
# Allows you to add categories to the classifier.
# For example:
#     b.add_category "Not spam"
#
# WARNING: Adding categories to a trained classifier will
# result in an undertrained category that will tend to match
# more criteria than the trained selective categories. In short,
# try to initialize your categories at initialization.
def add_category(category)
  @categories[prepare_category_name(category)] = {}
end

alias append_category add_category
|
132
|
+
end
|
133
|
+
|
134
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# Author:: Ernest Ellingson
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
|
+
|
4
|
+
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
|
+
|
6
|
+
require 'matrix'
|
7
|
+
require 'mathn'
|
8
|
+
|
9
|
+
class Vector
  # Euclidean (L2) length of the vector.
  def magnitude
    Math.sqrt(to_a.inject(0.0) { |acc, component| acc + component ** 2.0 })
  end

  # Returns a new vector of unit length pointing in the same direction.
  def normalize
    mag = magnitude
    Vector[*to_a.map { |component| component / mag }]
  end
end
|
28
|
+
|
29
|
+
class Matrix
  # Builds a diagonal matrix from an array of values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # All-Ruby singular value decomposition via cyclic Jacobi rotations on
  # the symmetric matrix A'A (or AA' for wide matrices).  Returns
  # [mu, v, s]: left singular vectors, rotation accumulator, and the
  # singular values as a plain Array.
  #
  # maxSweeps - maximum number of Jacobi sweeps before giving up.
  #
  # NOTE(review): divides by (qrot[row,row]-qrot[col,col]) below, which
  # raises ZeroDivisionError / yields NaN when two diagonal entries are
  # equal — TODO confirm callers never hit that case.
  def SV_decomp(maxSweeps = 20)
    # Work on the smaller of A'A / AA' so q is square and symmetric.
    if self.row_size >= self.column_size
      q = self.trans * self
    else
      q = self * self.trans
    end

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    azrot = nil   # NOTE(review): never used after this point
    mzrot = nil
    cnt = 0
    s_old = nil
    mu = nil

    while true do
      cnt += 1
      # One Jacobi sweep: zero out each off-diagonal pair (row, col)
      # with a plane rotation mzrot, accumulating rotations into v.
      for row in (0...qrot.row_size-1) do
        for col in (1..qrot.row_size-1) do
          next if row == col
          # Rotation angle that annihilates qrot[row,col].
          h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          # Uses the Matrix#[]= defined at the bottom of this class.
          mzrot[row,row] = hcos
          mzrot[row,col] = -hsin
          mzrot[col,row] = hsin
          mzrot[col,col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end
      # Convergence test: total drift of the diagonal since the last
      # sweep, ignoring changes below the 0.001 tolerance.
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
    end # of do while true
    # Singular values are the square roots of the converged diagonal.
    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r,r])
    end
    #puts "cnt = #{cnt}"
    if self.row_size >= self.column_size
      mu = self * v * Matrix.diagonal(*s).inverse
      return [mu, v, s]
    else
      # NOTE(review): these puts look like leftover debug output.
      puts v.row_size
      puts v.column_size
      puts self.row_size
      puts self.column_size
      puts s.size

      mu = (self.trans * v * Matrix.diagonal(*s).inverse)
      return [mu, v, s]
    end
  end
  # Element assignment; reaches into Matrix's internal @rows storage,
  # which stdlib Matrix does not expose publicly.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module GSL

  class Vector
    # Marshal support: serialize the vector as a plain Ruby Array.
    def _dump(v)
      Marshal.dump(to_a)
    end

    # Marshal support: rebuild a GSL::Vector from the dumped Array.
    def self._load(arr)
      GSL::Vector.alloc(Marshal.load(arr))
    end

  end

  class Matrix
    class << self
      # Match the stdlib Matrix.diag shorthand defined elsewhere.
      alias_method :diag, :diagonal
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier

  # This is an internal data structure class for the LSI node. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    # raw_* hold the unreduced term-frequency vector and its normalized
    # form; lsi_* are populated once the LSI index is built and, when
    # present, take precedence (see search_vector / search_norm).
    attr_accessor :raw_vector, :raw_norm,
      :lsi_vector, :lsi_norm,
      :categories

    attr_reader :word_hash
    # If text_proc is not specified, the source will be duck-typed
    # via source.to_s
    def initialize( word_hash, *categories )
      @categories = categories || []  # NOTE(review): *categories is never nil, so "|| []" is dead code
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    #
    # word_list - a Classifier::WordList mapping word => dimension index.
    #
    # Sets @raw_vector and @raw_norm as a side effect; no return value
    # is meant to be used.
    def raw_vector_with( word_list )
      # $GSL is a global toggle: use GSL vectors when the gsl bindings
      # were loaded, otherwise plain Arrays (wrapped in stdlib Vector).
      if $GSL
        vec = GSL::Vector.alloc(word_list.size)
      else
        vec = Array.new(word_list.size, 0)
      end

      # Fill each known word's dimension with its raw frequency; words
      # not present in word_list are silently dropped.
      @word_hash.each_key do |word|
        vec[word_list[word]] = @word_hash[word] if word_list[word]
      end

      # Perform the scaling transform
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        # weighted_total accumulates the (negative) entropy of the
        # term-frequency distribution; it is used as the scaling factor.
        weighted_total = 0.0
        vec.each do |term|
          if ( term > 0 )
            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
          end
        end
        weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
        # Log-scale each count and divide by the entropy (negated so the
        # result keeps the sign of the log term).
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm   = vec.normalize
        @raw_vector = vec
      else
        @raw_norm   = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
class String
  # Returns a summary of up to +count+ of the most representative
  # sentences (per LSI), joined by +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Like #summary, but operates on whole paragraphs.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on sentence terminators; the capture group keeps the
  # punctuation tokens in the result array.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  # Splits on blank lines; the capture group keeps the delimiters.
  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the non-trivial chunks with LSI and joins the most
  # representative ones.  Chunks that are empty or a single word are
  # skipped before indexing.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # Fix: the original filtered summaries against itself
    # (reject { |c| !summaries.include? c }) — a tautology that always
    # kept every element; the no-op filter is removed.
    summaries.map { |x| x.strip }.join(separator)
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.

  class WordList
    def initialize
      @location_table = Hash.new
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    # Returns the new index, or nil when the word was already mapped.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Reverse lookup: the word mapped to dimension +ind+, or nil.
    # Fix: uses Hash#key instead of building a full inverted Hash
    # (Hash#invert) on every call; indices are unique, so the first
    # match is the only match.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end

  end
end
|