logankoester-classifier 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +429 -0
- data/Manifest +19 -0
- data/README.rdoc +124 -0
- data/Rakefile +21 -0
- data/VERSION.yml +5 -0
- data/lib/classifier.rb +31 -0
- data/lib/classifier/base.rb +65 -0
- data/lib/classifier/bayes.rb +145 -0
- data/lib/classifier/extensions/vector.rb +100 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/lsi.rb +348 -0
- data/lib/classifier/lsi/content_node.rb +73 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/lib/classifier/stopwords.rb +42 -0
- data/lib/classifier/stopwords/en +82 -0
- data/lib/classifier/stopwords/es +339 -0
- data/lib/classifier/stopwords/ru +161 -0
- data/lib/init.rb +1 -0
- data/tasks/test.rake +6 -0
- data/test/base_test.rb +17 -0
- data/test/bayes/bayesian_test.rb +68 -0
- data/test/lsi/lsi_test.rb +167 -0
- data/test/stopwords_test.rb +38 -0
- data/test/test_helper.rb +4 -0
- metadata +127 -0
@@ -0,0 +1,73 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier

  # Internal data structure holding the vectors for a single LSI node.
  # Apart from raw_vector_with it is a plain value holder, and you should
  # never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # word_hash::  mapping of word => count for this node; it is iterated
    #              as key/value pairs by raw_vector_with.
    # categories:: zero or more category labels, stored as an Array.
    def initialize(word_hash, *categories)
      # A splat parameter is always an Array (empty when nothing is passed).
      @categories = categories
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector: the LSI vector when
    # one has been assigned, otherwise the raw vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form:
    # the LSI norm when one has been assigned, otherwise the raw norm.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    #
    # word_list must respond to #size (number of dimensions) and #[]
    # (word => dimension index, or nil when the word is unknown).
    #
    # Stores the result in raw_vector and its normalized form in raw_norm,
    # and returns the raw vector.
    def raw_vector_with(word_list)
      # $GSL is set elsewhere; when truthy the GSL vector type is used
      # instead of a plain Array.
      vec = $GSL ? GSL::Vector.alloc(word_list.size) : Array.new(word_list.size, 0)

      # Words that are not part of the word list are skipped.
      @word_hash.each do |word, count|
        index = word_list[word]
        vec[index] = count if index
      end

      # Perform the scaling transform
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          next unless term > 0

          ratio = term / total_words
          weighted_total += ratio * Math.log(ratio)
        end
        # The sum is zero when a single known term carries all of the
        # weight (ratio 1, log 0); fall back to -1.0 so the division below
        # never divides by zero.
        weighted_total = -1.0 if weighted_total.zero?
        vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
      end

      # Build the final vector once, normalize it, then publish both; the
      # norm is assigned first, so if normalization raises neither
      # attribute is changed.
      raw = $GSL ? vec : Vector[*vec]
      @raw_norm = raw.normalize
      @raw_vector = raw
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
# Summarization helpers added to String. They split the text into chunks
# and hand those to Classifier::LSI (defined elsewhere in the gem).
class String
  # Summarizes the string sentence by sentence.
  #
  # count::     passed to LSI#highest_relative_content (defaults to 10).
  # separator:: string placed between the returned chunks.
  def summary(count = 10, separator = " [...] ")
    perform_lsi(split_sentences, count, separator)
  end

  # Same as #summary, but the text is chunked by paragraph and count
  # defaults to 1.
  def paragraph_summary(count = 1, separator = " [...] ")
    perform_lsi(split_paragraphs, count, separator)
  end

  # Splits on ".", "!" and "?". Because the pattern has a capture group,
  # the delimiters themselves are returned as separate elements.
  def split_sentences
    split(/(\.|\!|\?)/) # TODO: make this less primitive
  end

  # Splits on blank-line paragraph breaks (\n\n, \r\r or \r\n\r\n). The
  # captured separators are returned as separate elements.
  def split_paragraphs
    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
  end

  private

  # Feeds every chunk with at least two whitespace-separated tokens into a
  # fresh LSI index, builds the index, and joins the stripped results of
  # highest_relative_content with separator.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each do |chunk|
      stripped = chunk.strip
      # Skip blank chunks and single tokens; this also drops the captured
      # delimiters produced by the split_* methods.
      next if stripped.empty? || stripped.split.size == 1

      lsi << chunk
    end
    lsi.build_index
    summaries = lsi.highest_relative_content(count)
    # NOTE(review): results keep the order returned by
    # highest_relative_content. If document order was intended, filter
    # `chunks` by membership in `summaries` instead -- confirm before
    # changing, since that alters the output.
    summaries.map(&:strip).join(separator)
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.

  class WordList
    def initialize
      @location_table = {}
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    # Returns the newly assigned index, or nil when the word was already
    # present.
    def add_word(word)
      # Indices are Integers, so an existing entry is always truthy (even 0).
      @location_table[word] = @location_table.size unless @location_table[word]
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Returns the word mapped to dimension ind, or nil if no word has that
    # index. Every index is assigned exactly once, so a direct reverse
    # lookup gives the same answer as inverting the whole table.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end

  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Classifier

  # Loads stopword lists from plain-text files and caches them per language.
  module StopWords

    # Cache of language => Array of stopwords, filled lazily by StopWords.for.
    STOP_WORDS = {}

    # Returns the stopwords for language, loading them on first use.
    #
    # language:: name of the stopword file (e.g. 'en'); also the cache key.
    # lang_dir:: optional directory searched before the bundled 'stopwords'
    #            directory next to this file.
    #
    # The result is cached by language only, so lang_dir is ignored once a
    # language has been loaded (call StopWords.reset to reload). An empty
    # Array is returned, and cached, when no usable file is found.
    def self.for(language, lang_dir = nil)
      STOP_WORDS.fetch(language) do
        STOP_WORDS[language] = load_stopwords(language, lang_dir)
      end
    end

    # Empties the cache so subsequent lookups read the files again.
    def self.reset
      STOP_WORDS.clear
    end

    # NOTE: the helpers below are defined with `def self.`, which visibility
    # keywords such as `protected` do not affect, so they are publicly
    # callable. Treat them as internal.

    # Looks for the language file in lang_dir first, then in the bundled
    # 'stopwords' directory. Returns an Array (empty if neither yields words).
    def self.load_stopwords(language, lang_dir)
      default_dir = File.join(File.dirname(__FILE__), 'stopwords')

      load_file(language, lang_dir) || load_file(language, default_dir) || []
    end

    # Reads lang_dir/language as UTF-8, one stopword per line. Anything from
    # '#' to the end of a line is a comment; blank lines are ignored.
    #
    # Returns the Array of words, or nil when lang_dir is nil, the path is
    # not a regular file, or the file contains no words.
    def self.load_file(language, lang_dir)
      return if lang_dir.nil?

      # to_s lets callers pass the language as a Symbol.
      lang_file = File.join(lang_dir, language.to_s)
      # File.file? (rather than File.exist?) keeps directories out, e.g.
      # when language is an empty string.
      return unless File.file?(lang_file)

      words = File.open(lang_file, 'r:utf-8') do |f|
        f.each_line.map { |line| line.sub(/#.*/, '').strip }.reject(&:empty?)
      end
      words unless words.empty?
    end

  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# English stopwords
|
2
|
+
# Extracted from the gem's source code
|
3
|
+
a
|
4
|
+
again
|
5
|
+
all
|
6
|
+
along
|
7
|
+
are
|
8
|
+
also
|
9
|
+
an
|
10
|
+
and
|
11
|
+
as
|
12
|
+
at
|
13
|
+
but
|
14
|
+
by
|
15
|
+
came
|
16
|
+
can
|
17
|
+
cant
|
18
|
+
couldnt
|
19
|
+
did
|
20
|
+
didn
|
21
|
+
didnt
|
22
|
+
do
|
23
|
+
doesnt
|
24
|
+
dont
|
25
|
+
ever
|
26
|
+
first
|
27
|
+
from
|
28
|
+
have
|
29
|
+
her
|
30
|
+
here
|
31
|
+
him
|
32
|
+
how
|
33
|
+
i
|
34
|
+
if
|
35
|
+
in
|
36
|
+
into
|
37
|
+
is
|
38
|
+
isnt
|
39
|
+
it
|
40
|
+
itll
|
41
|
+
just
|
42
|
+
last
|
43
|
+
least
|
44
|
+
like
|
45
|
+
most
|
46
|
+
my
|
47
|
+
new
|
48
|
+
no
|
49
|
+
not
|
50
|
+
now
|
51
|
+
of
|
52
|
+
on
|
53
|
+
or
|
54
|
+
should
|
55
|
+
sinc
|
56
|
+
so
|
57
|
+
some
|
58
|
+
th
|
59
|
+
than
|
60
|
+
this
|
61
|
+
that
|
62
|
+
the
|
63
|
+
their
|
64
|
+
then
|
65
|
+
those
|
66
|
+
to
|
67
|
+
told
|
68
|
+
too
|
69
|
+
true
|
70
|
+
try
|
71
|
+
until
|
72
|
+
url
|
73
|
+
us
|
74
|
+
were
|
75
|
+
when
|
76
|
+
whether
|
77
|
+
while
|
78
|
+
with
|
79
|
+
within
|
80
|
+
yes
|
81
|
+
you
|
82
|
+
youll
|
@@ -0,0 +1,339 @@
|
|
1
|
+
# Spanish stopwords
|
2
|
+
# http://snowball.tartarus.org/algorithms/spanish/stop.txt
|
3
|
+
de # from, of
|
4
|
+
la # the, her
|
5
|
+
que # who, that
|
6
|
+
el # the
|
7
|
+
en # in
|
8
|
+
y # and
|
9
|
+
a # to
|
10
|
+
los # the, them
|
11
|
+
del # de + el
|
12
|
+
se # himself, from him etc
|
13
|
+
las # the, them
|
14
|
+
por # for, by, etc
|
15
|
+
un # a
|
16
|
+
para # for
|
17
|
+
con # with
|
18
|
+
no # no
|
19
|
+
una # a
|
20
|
+
su # his, her
|
21
|
+
al # a + el
|
22
|
+
es # from SER
|
23
|
+
lo # him
|
24
|
+
como # how
|
25
|
+
más # more
|
26
|
+
pero # but
|
27
|
+
sus # su plural
|
28
|
+
le # to him, her
|
29
|
+
ya # already
|
30
|
+
o # or
|
31
|
+
fue # from SER
|
32
|
+
este # this
|
33
|
+
ha # from HABER
|
34
|
+
sí # himself etc
|
35
|
+
porque # because
|
36
|
+
esta # this
|
37
|
+
son # from SER
|
38
|
+
entre # between
|
39
|
+
está # from ESTAR
|
40
|
+
cuando # when
|
41
|
+
muy # very
|
42
|
+
sin # without
|
43
|
+
sobre # on
|
44
|
+
ser # from SER
|
45
|
+
tiene # from TENER
|
46
|
+
también # also
|
47
|
+
me # me
|
48
|
+
hasta # until
|
49
|
+
hay # there is/are
|
50
|
+
donde # where
|
51
|
+
han # from HABER
|
52
|
+
quien # whom, that
|
53
|
+
están # from ESTAR
|
54
|
+
estado # from ESTAR
|
55
|
+
desde # from
|
56
|
+
todo # all
|
57
|
+
nos # us
|
58
|
+
durante # during
|
59
|
+
estados # from ESTAR
|
60
|
+
todos # all
|
61
|
+
uno # a
|
62
|
+
les # to them
|
63
|
+
ni # nor
|
64
|
+
contra # against
|
65
|
+
otros # other
|
66
|
+
fueron # from SER
|
67
|
+
ese # that
|
68
|
+
eso # that
|
69
|
+
había # from HABER
|
70
|
+
ante # before
|
71
|
+
ellos # they
|
72
|
+
e # and (variant of y)
|
73
|
+
esto # this
|
74
|
+
mí # me
|
75
|
+
antes # before
|
76
|
+
algunos # some
|
77
|
+
qué # what?
|
78
|
+
unos # a
|
79
|
+
yo # I
|
80
|
+
otro # other
|
81
|
+
otras # other
|
82
|
+
otra # other
|
83
|
+
él # he
|
84
|
+
tanto # so much, many
|
85
|
+
esa # that
|
86
|
+
estos # these
|
87
|
+
mucho # much, many
|
88
|
+
quienes # who
|
89
|
+
nada # nothing
|
90
|
+
muchos # many
|
91
|
+
cual # who
|
92
|
+
sea # from SER
|
93
|
+
poco # few
|
94
|
+
ella # she
|
95
|
+
estar # to be
|
96
|
+
haber # from HABER
|
97
|
+
estas # these
|
98
|
+
estaba # from ESTAR
|
99
|
+
estamos # from ESTAR
|
100
|
+
algunas # some
|
101
|
+
algo # something
|
102
|
+
nosotros # we
|
103
|
+
|
104
|
+
# other forms
|
105
|
+
|
106
|
+
mi # me
|
107
|
+
mis # mi plural
|
108
|
+
tú # thou
|
109
|
+
te # thee
|
110
|
+
ti # thee
|
111
|
+
tu # thy
|
112
|
+
tus # tu plural
|
113
|
+
ellas # they
|
114
|
+
nosotras # we
|
115
|
+
vosotros # you
|
116
|
+
vosotras # you
|
117
|
+
os # you
|
118
|
+
mío # mine
|
119
|
+
mía #
|
120
|
+
míos #
|
121
|
+
mías #
|
122
|
+
tuyo # thine
|
123
|
+
tuya #
|
124
|
+
tuyos #
|
125
|
+
tuyas #
|
126
|
+
suyo # his, hers, theirs
|
127
|
+
suya #
|
128
|
+
suyos #
|
129
|
+
suyas #
|
130
|
+
nuestro # ours
|
131
|
+
nuestra #
|
132
|
+
nuestros #
|
133
|
+
nuestras #
|
134
|
+
vuestro # yours
|
135
|
+
vuestra #
|
136
|
+
vuestros #
|
137
|
+
vuestras #
|
138
|
+
esos # those
|
139
|
+
esas # those
|
140
|
+
|
141
|
+
# forms of estar, to be (not including the infinitive):
|
142
|
+
estoy
|
143
|
+
estás
|
144
|
+
está
|
145
|
+
estamos
|
146
|
+
estáis
|
147
|
+
están
|
148
|
+
esté
|
149
|
+
estés
|
150
|
+
estemos
|
151
|
+
estéis
|
152
|
+
estén
|
153
|
+
estaré
|
154
|
+
estarás
|
155
|
+
estará
|
156
|
+
estaremos
|
157
|
+
estaréis
|
158
|
+
estarán
|
159
|
+
estaría
|
160
|
+
estarías
|
161
|
+
estaríamos
|
162
|
+
estaríais
|
163
|
+
estarían
|
164
|
+
estaba
|
165
|
+
estabas
|
166
|
+
estábamos
|
167
|
+
estabais
|
168
|
+
estaban
|
169
|
+
estuve
|
170
|
+
estuviste
|
171
|
+
estuvo
|
172
|
+
estuvimos
|
173
|
+
estuvisteis
|
174
|
+
estuvieron
|
175
|
+
estuviera
|
176
|
+
estuvieras
|
177
|
+
estuviéramos
|
178
|
+
estuvierais
|
179
|
+
estuvieran
|
180
|
+
estuviese
|
181
|
+
estuvieses
|
182
|
+
estuviésemos
|
183
|
+
estuvieseis
|
184
|
+
estuviesen
|
185
|
+
estando
|
186
|
+
estado
|
187
|
+
estada
|
188
|
+
estados
|
189
|
+
estadas
|
190
|
+
estad
|
191
|
+
|
192
|
+
# forms of haber, to have (not including the infinitive):
|
193
|
+
he
|
194
|
+
has
|
195
|
+
ha
|
196
|
+
hemos
|
197
|
+
habéis
|
198
|
+
han
|
199
|
+
haya
|
200
|
+
hayas
|
201
|
+
hayamos
|
202
|
+
hayáis
|
203
|
+
hayan
|
204
|
+
habré
|
205
|
+
habrás
|
206
|
+
habrá
|
207
|
+
habremos
|
208
|
+
habréis
|
209
|
+
habrán
|
210
|
+
habría
|
211
|
+
habrías
|
212
|
+
habríamos
|
213
|
+
habríais
|
214
|
+
habrían
|
215
|
+
había
|
216
|
+
habías
|
217
|
+
habíamos
|
218
|
+
habíais
|
219
|
+
habían
|
220
|
+
hube
|
221
|
+
hubiste
|
222
|
+
hubo
|
223
|
+
hubimos
|
224
|
+
hubisteis
|
225
|
+
hubieron
|
226
|
+
hubiera
|
227
|
+
hubieras
|
228
|
+
hubiéramos
|
229
|
+
hubierais
|
230
|
+
hubieran
|
231
|
+
hubiese
|
232
|
+
hubieses
|
233
|
+
hubiésemos
|
234
|
+
hubieseis
|
235
|
+
hubiesen
|
236
|
+
habiendo
|
237
|
+
habido
|
238
|
+
habida
|
239
|
+
habidos
|
240
|
+
habidas
|
241
|
+
|
242
|
+
# forms of ser, to be (not including the infinitive):
|
243
|
+
soy
|
244
|
+
eres
|
245
|
+
es
|
246
|
+
somos
|
247
|
+
sois
|
248
|
+
son
|
249
|
+
sea
|
250
|
+
seas
|
251
|
+
seamos
|
252
|
+
seáis
|
253
|
+
sean
|
254
|
+
seré
|
255
|
+
serás
|
256
|
+
será
|
257
|
+
seremos
|
258
|
+
seréis
|
259
|
+
serán
|
260
|
+
sería
|
261
|
+
serías
|
262
|
+
seríamos
|
263
|
+
seríais
|
264
|
+
serían
|
265
|
+
era
|
266
|
+
eras
|
267
|
+
éramos
|
268
|
+
erais
|
269
|
+
eran
|
270
|
+
fui
|
271
|
+
fuiste
|
272
|
+
fue
|
273
|
+
fuimos
|
274
|
+
fuisteis
|
275
|
+
fueron
|
276
|
+
fuera
|
277
|
+
fueras
|
278
|
+
fuéramos
|
279
|
+
fuerais
|
280
|
+
fueran
|
281
|
+
fuese
|
282
|
+
fueses
|
283
|
+
fuésemos
|
284
|
+
fueseis
|
285
|
+
fuesen
|
286
|
+
siendo
|
287
|
+
sido
|
288
|
+
# sed also means 'thirst'
|
289
|
+
|
290
|
+
# forms of tener, to have (not including the infinitive):
|
291
|
+
tengo
|
292
|
+
tienes
|
293
|
+
tiene
|
294
|
+
tenemos
|
295
|
+
tenéis
|
296
|
+
tienen
|
297
|
+
tenga
|
298
|
+
tengas
|
299
|
+
tengamos
|
300
|
+
tengáis
|
301
|
+
tengan
|
302
|
+
tendré
|
303
|
+
tendrás
|
304
|
+
tendrá
|
305
|
+
tendremos
|
306
|
+
tendréis
|
307
|
+
tendrán
|
308
|
+
tendría
|
309
|
+
tendrías
|
310
|
+
tendríamos
|
311
|
+
tendríais
|
312
|
+
tendrían
|
313
|
+
tenía
|
314
|
+
tenías
|
315
|
+
teníamos
|
316
|
+
teníais
|
317
|
+
tenían
|
318
|
+
tuve
|
319
|
+
tuviste
|
320
|
+
tuvo
|
321
|
+
tuvimos
|
322
|
+
tuvisteis
|
323
|
+
tuvieron
|
324
|
+
tuviera
|
325
|
+
tuvieras
|
326
|
+
tuviéramos
|
327
|
+
tuvierais
|
328
|
+
tuvieran
|
329
|
+
tuviese
|
330
|
+
tuvieses
|
331
|
+
tuviésemos
|
332
|
+
tuvieseis
|
333
|
+
tuviesen
|
334
|
+
teniendo
|
335
|
+
tenido
|
336
|
+
tenida
|
337
|
+
tenidos
|
338
|
+
tenidas
|
339
|
+
tened
|