logankoester-classifier 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +429 -0
- data/Manifest +19 -0
- data/README.rdoc +124 -0
- data/Rakefile +21 -0
- data/VERSION.yml +5 -0
- data/lib/classifier.rb +31 -0
- data/lib/classifier/base.rb +65 -0
- data/lib/classifier/bayes.rb +145 -0
- data/lib/classifier/extensions/vector.rb +100 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/lsi.rb +348 -0
- data/lib/classifier/lsi/content_node.rb +73 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/lib/classifier/stopwords.rb +42 -0
- data/lib/classifier/stopwords/en +82 -0
- data/lib/classifier/stopwords/es +339 -0
- data/lib/classifier/stopwords/ru +161 -0
- data/lib/init.rb +1 -0
- data/tasks/test.rake +6 -0
- data/test/base_test.rb +17 -0
- data/test/bayes/bayesian_test.rb +68 -0
- data/test/lsi/lsi_test.rb +167 -0
- data/test/stopwords_test.rb +38 -0
- data/test/test_helper.rb +4 -0
- metadata +127 -0
@@ -0,0 +1,73 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier

  # Internal data structure for one item indexed by the LSI classifier. It
  # keeps the item's word hash and categories together with the vectors
  # derived from them. Save for raw_vector_with, it should be fairly
  # straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # word_hash::  hash keyed by word; each value is used as that word's
    #              weight when the raw vector is built (presumably an
    #              occurrence count -- confirm against the caller).
    # categories:: zero or more category labels attached to this node.
    def initialize( word_hash, *categories )
      # A splat parameter is always an Array, so no nil fallback is needed.
      @categories = categories
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector: the LSI vector when
    # one has been assigned, otherwise the raw vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    #
    # word_list must respond to +size+ and to +[]+ (word => dimension index,
    # or nil when the word is not part of the space). Stores the result in
    # raw_vector and its normalized form in raw_norm, and returns raw_vector.
    def raw_vector_with( word_list )
      vec = if $GSL
              GSL::Vector.alloc(word_list.size)
            else
              Array.new(word_list.size, 0)
            end

      # Words missing from word_list have no dimension and are skipped.
      @word_hash.each_key do |word|
        index = word_list[word]
        vec[index] = @word_hash[word] if index
      end

      # Perform the scaling transform
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        # Sum of p * log(p) over the non-zero terms, where p is the term's
        # share of the total; this is zero or negative.
        weighted_total = 0.0
        vec.each do |term|
          next unless term > 0

          share = term / total_words
          weighted_total += share * Math.log( share )
        end
        # A zero sum would divide by zero below; fall back to a divisor of 1.
        weighted_total = -1.0 if weighted_total.zero?
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        # Build the Vector once and reuse it for both attributes.
        vector = Vector[*vec]
        @raw_norm = vector.normalize
        @raw_vector = vector
      end
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
# Summarization helpers added to String. They split the text into chunks and
# ask Classifier::LSI for the chunks with the highest relative content.
class String
  # Returns up to +count+ sentences chosen by the LSI index, each stripped of
  # surrounding whitespace and joined with +separator+.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Same as #summary, but works on paragraphs instead of sentences.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # Splits on '.', '!' and '?'. Because the pattern has a capture group, the
  # punctuation marks themselves are also returned as separate elements.
  def split_sentences
    split(/(\.|\!|\?)/) # TODO: make this less primitive
  end

  # Splits on blank lines (LF, CR or CRLF line endings). Because the pattern
  # has a capture group, the separators are also returned as elements.
  def split_paragraphs
    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
  end

  private

  # Feeds every chunk of two or more words into a fresh LSI index and returns
  # the +count+ chunks it ranks highest, stripped and joined with +separator+,
  # in the order the index returned them.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each do |chunk|
      # Skips blank chunks and single-word chunks; the latter also drops the
      # punctuation/separator elements produced by the split methods.
      lsi << chunk if chunk.strip.split.size > 1
    end
    lsi.build_index
    lsi.highest_relative_content(count).map { |chunk| chunk.strip }.join(separator)
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.

  class WordList
    def initialize
      @location_table = {}
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    # Returns the newly assigned index, or nil when the word was already known.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Returns the word stored at dimension +ind+, or nil if no word has it.
    # Hash#key scans for the value without building an inverted copy of the
    # whole table on every call.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end

  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Classifier

  # Loads stopword lists from plain-text files (one word per line, with
  # '#' starting a comment) and caches them per language.
  module StopWords
    # Cache of loaded lists, keyed by the +language+ value given to .for.
    STOP_WORDS = {}

    # Returns the stopword list for +language+, loading it on first use.
    #
    # language:: name of the stopword file; a String or a Symbol.
    # lang_dir:: optional directory searched before the bundled 'stopwords'
    #            directory next to this file.
    #
    # The result is cached per language, so +lang_dir+ only matters on the
    # first call for a given language (until .reset). Returns an empty Array
    # when no non-empty list is found.
    def self.for(language, lang_dir=nil)
      unless STOP_WORDS.key?(language)
        STOP_WORDS[language] = load_stopwords(language, lang_dir)
      end
      STOP_WORDS[language]
    end

    # Empties the cache so that lists are re-read on the next .for call.
    def self.reset
      STOP_WORDS.clear
    end

    # NOTE: the helpers below were preceded by a bare `protected`, which has
    # no effect on `def self.` methods. They have therefore always been
    # publicly callable and are left that way for compatibility.

    # Tries +lang_dir+ first and then the bundled directory; returns [] when
    # neither yields any words.
    def self.load_stopwords(language, lang_dir)
      default_dir = File.join(File.dirname(__FILE__), 'stopwords')

      load_file(language, lang_dir) || load_file(language, default_dir) || []
    end

    # Reads +lang_dir+/+language+ as UTF-8 and returns its words with
    # comments and blank lines removed. Returns nil when +lang_dir+ is nil,
    # when there is no regular file with that name, or when the file
    # contains no words.
    def self.load_file(language, lang_dir)
      return if lang_dir.nil?

      # to_s lets callers pass a Symbol such as :en.
      lang_file = File.join(lang_dir, language.to_s)
      # File.file? (rather than File.exist?) so that a directory with the
      # language's name is treated as "not found" instead of failing on read.
      return unless File.file?(lang_file)

      words = []
      File.open(lang_file, 'r:utf-8') do |f|
        f.each_line do |line|
          word = line.gsub(/#.*/, '').strip
          words << word unless word.empty?
        end
      end
      words unless words.empty?
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# English stopwords
|
2
|
+
# Extracted from the gem's source code
|
3
|
+
a
|
4
|
+
again
|
5
|
+
all
|
6
|
+
along
|
7
|
+
are
|
8
|
+
also
|
9
|
+
an
|
10
|
+
and
|
11
|
+
as
|
12
|
+
at
|
13
|
+
but
|
14
|
+
by
|
15
|
+
came
|
16
|
+
can
|
17
|
+
cant
|
18
|
+
couldnt
|
19
|
+
did
|
20
|
+
didn
|
21
|
+
didnt
|
22
|
+
do
|
23
|
+
doesnt
|
24
|
+
dont
|
25
|
+
ever
|
26
|
+
first
|
27
|
+
from
|
28
|
+
have
|
29
|
+
her
|
30
|
+
here
|
31
|
+
him
|
32
|
+
how
|
33
|
+
i
|
34
|
+
if
|
35
|
+
in
|
36
|
+
into
|
37
|
+
is
|
38
|
+
isnt
|
39
|
+
it
|
40
|
+
itll
|
41
|
+
just
|
42
|
+
last
|
43
|
+
least
|
44
|
+
like
|
45
|
+
most
|
46
|
+
my
|
47
|
+
new
|
48
|
+
no
|
49
|
+
not
|
50
|
+
now
|
51
|
+
of
|
52
|
+
on
|
53
|
+
or
|
54
|
+
should
|
55
|
+
sinc
|
56
|
+
so
|
57
|
+
some
|
58
|
+
th
|
59
|
+
than
|
60
|
+
this
|
61
|
+
that
|
62
|
+
the
|
63
|
+
their
|
64
|
+
then
|
65
|
+
those
|
66
|
+
to
|
67
|
+
told
|
68
|
+
too
|
69
|
+
true
|
70
|
+
try
|
71
|
+
until
|
72
|
+
url
|
73
|
+
us
|
74
|
+
were
|
75
|
+
when
|
76
|
+
whether
|
77
|
+
while
|
78
|
+
with
|
79
|
+
within
|
80
|
+
yes
|
81
|
+
you
|
82
|
+
youll
|
@@ -0,0 +1,339 @@
|
|
1
|
+
# Spanish stopwords
|
2
|
+
# http://snowball.tartarus.org/algorithms/spanish/stop.txt
|
3
|
+
de # from, of
|
4
|
+
la # the, her
|
5
|
+
que # who, that
|
6
|
+
el # the
|
7
|
+
en # in
|
8
|
+
y # and
|
9
|
+
a # to
|
10
|
+
los # the, them
|
11
|
+
del # de + el
|
12
|
+
se # himself, from him etc
|
13
|
+
las # the, them
|
14
|
+
por # for, by, etc
|
15
|
+
un # a
|
16
|
+
para # for
|
17
|
+
con # with
|
18
|
+
no # no
|
19
|
+
una # a
|
20
|
+
su # his, her
|
21
|
+
al # a + el
|
22
|
+
es # from SER
|
23
|
+
lo # him
|
24
|
+
como # how
|
25
|
+
más # more
|
26
|
+
pero # but
|
27
|
+
sus # su plural
|
28
|
+
le # to him, her
|
29
|
+
ya # already
|
30
|
+
o # or
|
31
|
+
fue # from SER
|
32
|
+
este # this
|
33
|
+
ha # from HABER
|
34
|
+
sí # himself etc
|
35
|
+
porque # because
|
36
|
+
esta # this
|
37
|
+
son # from SER
|
38
|
+
entre # between
|
39
|
+
está # from ESTAR
|
40
|
+
cuando # when
|
41
|
+
muy # very
|
42
|
+
sin # without
|
43
|
+
sobre # on
|
44
|
+
ser # from SER
|
45
|
+
tiene # from TENER
|
46
|
+
también # also
|
47
|
+
me # me
|
48
|
+
hasta # until
|
49
|
+
hay # there is/are
|
50
|
+
donde # where
|
51
|
+
han # from HABER
|
52
|
+
quien # whom, that
|
53
|
+
están # from ESTAR
|
54
|
+
estado # from ESTAR
|
55
|
+
desde # from
|
56
|
+
todo # all
|
57
|
+
nos # us
|
58
|
+
durante # during
|
59
|
+
estados # from ESTAR
|
60
|
+
todos # all
|
61
|
+
uno # a
|
62
|
+
les # to them
|
63
|
+
ni # nor
|
64
|
+
contra # against
|
65
|
+
otros # other
|
66
|
+
fueron # from SER
|
67
|
+
ese # that
|
68
|
+
eso # that
|
69
|
+
había # from HABER
|
70
|
+
ante # before
|
71
|
+
ellos # they
|
72
|
+
e # and (variant of y)
|
73
|
+
esto # this
|
74
|
+
mí # me
|
75
|
+
antes # before
|
76
|
+
algunos # some
|
77
|
+
qué # what?
|
78
|
+
unos # a
|
79
|
+
yo # I
|
80
|
+
otro # other
|
81
|
+
otras # other
|
82
|
+
otra # other
|
83
|
+
él # he
|
84
|
+
tanto # so much, many
|
85
|
+
esa # that
|
86
|
+
estos # these
|
87
|
+
mucho # much, many
|
88
|
+
quienes # who
|
89
|
+
nada # nothing
|
90
|
+
muchos # many
|
91
|
+
cual # who
|
92
|
+
sea # from SER
|
93
|
+
poco # few
|
94
|
+
ella # she
|
95
|
+
estar # to be
|
96
|
+
haber # from HABER
|
97
|
+
estas # these
|
98
|
+
estaba # from ESTAR
|
99
|
+
estamos # from ESTAR
|
100
|
+
algunas # some
|
101
|
+
algo # something
|
102
|
+
nosotros # we
|
103
|
+
|
104
|
+
# other forms
|
105
|
+
|
106
|
+
mi # me
|
107
|
+
mis # mi plural
|
108
|
+
tú # thou
|
109
|
+
te # thee
|
110
|
+
ti # thee
|
111
|
+
tu # thy
|
112
|
+
tus # tu plural
|
113
|
+
ellas # they
|
114
|
+
nosotras # we
|
115
|
+
vosotros # you
|
116
|
+
vosotras # you
|
117
|
+
os # you
|
118
|
+
mío # mine
|
119
|
+
mía #
|
120
|
+
míos #
|
121
|
+
mías #
|
122
|
+
tuyo # thine
|
123
|
+
tuya #
|
124
|
+
tuyos #
|
125
|
+
tuyas #
|
126
|
+
suyo # his, hers, theirs
|
127
|
+
suya #
|
128
|
+
suyos #
|
129
|
+
suyas #
|
130
|
+
nuestro # ours
|
131
|
+
nuestra #
|
132
|
+
nuestros #
|
133
|
+
nuestras #
|
134
|
+
vuestro # yours
|
135
|
+
vuestra #
|
136
|
+
vuestros #
|
137
|
+
vuestras #
|
138
|
+
esos # those
|
139
|
+
esas # those
|
140
|
+
|
141
|
+
# forms of estar, to be (not including the infinitive):
|
142
|
+
estoy
|
143
|
+
estás
|
144
|
+
está
|
145
|
+
estamos
|
146
|
+
estáis
|
147
|
+
están
|
148
|
+
esté
|
149
|
+
estés
|
150
|
+
estemos
|
151
|
+
estéis
|
152
|
+
estén
|
153
|
+
estaré
|
154
|
+
estarás
|
155
|
+
estará
|
156
|
+
estaremos
|
157
|
+
estaréis
|
158
|
+
estarán
|
159
|
+
estaría
|
160
|
+
estarías
|
161
|
+
estaríamos
|
162
|
+
estaríais
|
163
|
+
estarían
|
164
|
+
estaba
|
165
|
+
estabas
|
166
|
+
estábamos
|
167
|
+
estabais
|
168
|
+
estaban
|
169
|
+
estuve
|
170
|
+
estuviste
|
171
|
+
estuvo
|
172
|
+
estuvimos
|
173
|
+
estuvisteis
|
174
|
+
estuvieron
|
175
|
+
estuviera
|
176
|
+
estuvieras
|
177
|
+
estuviéramos
|
178
|
+
estuvierais
|
179
|
+
estuvieran
|
180
|
+
estuviese
|
181
|
+
estuvieses
|
182
|
+
estuviésemos
|
183
|
+
estuvieseis
|
184
|
+
estuviesen
|
185
|
+
estando
|
186
|
+
estado
|
187
|
+
estada
|
188
|
+
estados
|
189
|
+
estadas
|
190
|
+
estad
|
191
|
+
|
192
|
+
# forms of haber, to have (not including the infinitive):
|
193
|
+
he
|
194
|
+
has
|
195
|
+
ha
|
196
|
+
hemos
|
197
|
+
habéis
|
198
|
+
han
|
199
|
+
haya
|
200
|
+
hayas
|
201
|
+
hayamos
|
202
|
+
hayáis
|
203
|
+
hayan
|
204
|
+
habré
|
205
|
+
habrás
|
206
|
+
habrá
|
207
|
+
habremos
|
208
|
+
habréis
|
209
|
+
habrán
|
210
|
+
habría
|
211
|
+
habrías
|
212
|
+
habríamos
|
213
|
+
habríais
|
214
|
+
habrían
|
215
|
+
había
|
216
|
+
habías
|
217
|
+
habíamos
|
218
|
+
habíais
|
219
|
+
habían
|
220
|
+
hube
|
221
|
+
hubiste
|
222
|
+
hubo
|
223
|
+
hubimos
|
224
|
+
hubisteis
|
225
|
+
hubieron
|
226
|
+
hubiera
|
227
|
+
hubieras
|
228
|
+
hubiéramos
|
229
|
+
hubierais
|
230
|
+
hubieran
|
231
|
+
hubiese
|
232
|
+
hubieses
|
233
|
+
hubiésemos
|
234
|
+
hubieseis
|
235
|
+
hubiesen
|
236
|
+
habiendo
|
237
|
+
habido
|
238
|
+
habida
|
239
|
+
habidos
|
240
|
+
habidas
|
241
|
+
|
242
|
+
# forms of ser, to be (not including the infinitive):
|
243
|
+
soy
|
244
|
+
eres
|
245
|
+
es
|
246
|
+
somos
|
247
|
+
sois
|
248
|
+
son
|
249
|
+
sea
|
250
|
+
seas
|
251
|
+
seamos
|
252
|
+
seáis
|
253
|
+
sean
|
254
|
+
seré
|
255
|
+
serás
|
256
|
+
será
|
257
|
+
seremos
|
258
|
+
seréis
|
259
|
+
serán
|
260
|
+
sería
|
261
|
+
serías
|
262
|
+
seríamos
|
263
|
+
seríais
|
264
|
+
serían
|
265
|
+
era
|
266
|
+
eras
|
267
|
+
éramos
|
268
|
+
erais
|
269
|
+
eran
|
270
|
+
fui
|
271
|
+
fuiste
|
272
|
+
fue
|
273
|
+
fuimos
|
274
|
+
fuisteis
|
275
|
+
fueron
|
276
|
+
fuera
|
277
|
+
fueras
|
278
|
+
fuéramos
|
279
|
+
fuerais
|
280
|
+
fueran
|
281
|
+
fuese
|
282
|
+
fueses
|
283
|
+
fuésemos
|
284
|
+
fueseis
|
285
|
+
fuesen
|
286
|
+
siendo
|
287
|
+
sido
|
288
|
+
# sed also means 'thirst'
|
289
|
+
|
290
|
+
# forms of tener, to have (not including the infinitive):
|
291
|
+
tengo
|
292
|
+
tienes
|
293
|
+
tiene
|
294
|
+
tenemos
|
295
|
+
tenéis
|
296
|
+
tienen
|
297
|
+
tenga
|
298
|
+
tengas
|
299
|
+
tengamos
|
300
|
+
tengáis
|
301
|
+
tengan
|
302
|
+
tendré
|
303
|
+
tendrás
|
304
|
+
tendrá
|
305
|
+
tendremos
|
306
|
+
tendréis
|
307
|
+
tendrán
|
308
|
+
tendría
|
309
|
+
tendrías
|
310
|
+
tendríamos
|
311
|
+
tendríais
|
312
|
+
tendrían
|
313
|
+
tenía
|
314
|
+
tenías
|
315
|
+
teníamos
|
316
|
+
teníais
|
317
|
+
tenían
|
318
|
+
tuve
|
319
|
+
tuviste
|
320
|
+
tuvo
|
321
|
+
tuvimos
|
322
|
+
tuvisteis
|
323
|
+
tuvieron
|
324
|
+
tuviera
|
325
|
+
tuvieras
|
326
|
+
tuviéramos
|
327
|
+
tuvierais
|
328
|
+
tuvieran
|
329
|
+
tuviese
|
330
|
+
tuvieses
|
331
|
+
tuviésemos
|
332
|
+
tuvieseis
|
333
|
+
tuviesen
|
334
|
+
teniendo
|
335
|
+
tenido
|
336
|
+
tenida
|
337
|
+
tenidos
|
338
|
+
tenidas
|
339
|
+
tened
|