categorize 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/categorize.rb +49 -0
- data/lib/constants.rb +541 -0
- data/lib/models/bag_of_words.rb +97 -0
- data/lib/utils/grams.rb +45 -0
- metadata +49 -0
data/lib/categorize.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'models', 'bag_of_words')
|
2
|
+
require File.join(File.dirname(__FILE__), 'constants')
|
3
|
+
|
4
|
+
module Categorize
  # Minimum token length kept by preprocessing; shorter tokens are noise.
  MIN_WORD_LENGTH = 3
  # Shared default topic model used by make_model.
  @bag_of_words = BagOfWords.new

  class << self
    #include Bow
    # Build a topic model over the given documents.
    #
    # ==== Return
    # Hash - category => results
    # ==== Parameters
    # query:: the query the documents were retrieved for
    # documents:: a list of documents to be classified
    # topic_model:: any object responding to #model(query, records_to_tokens);
    #               defaults to the shared BagOfWords instance
    def make_model(query, documents, topic_model = @bag_of_words)
      records_to_tokens = lexicalize(documents)
      topic_model.model(query.downcase.strip, records_to_tokens)
    end

    # ==== Return
    # Hash - term => list of 1-based positions at which the term occurs
    # ==== Parameters
    # strings:: the items to be classified
    def make_model_c(strings)
      # BUG FIX: the original discarded the preprocessed tokens and then
      # referenced an undefined `ret` (the model_bow call below was commented
      # out), so this method always raised NameError. Presumably `ret` was
      # meant to hold the token stream — TODO confirm against the original
      # model_bow implementation.
      #ret = model_bow(array_of_tokens);
      ret = strings.flat_map { |s| preprocess(s) }
      count = 0
      ret.inject({}) do |hash, term|
        hash[term] ||= []
        hash[term] << count += 1
        hash
      end
    end

    private

    # Map each document's index (0...n) to its preprocessed token list.
    def lexicalize(strings)
      Hash[
        (0..(strings.length - 1)).zip(strings.map { |s| preprocess(s) })
      ]
    end

    # Split, downcase and filter a raw string into significant tokens,
    # dropping words shorter than MIN_WORD_LENGTH and stop-words found in
    # Constants::Words::COMMON.
    def preprocess(string)
      string.split(Constants::Words::SPLIT_REGEX).map(&:downcase).delete_if do |word|
        word.length < MIN_WORD_LENGTH ||
          Constants::Words::COMMON.include?(word)
      end
    end
  end
end
|
data/lib/constants.rb
ADDED
@@ -0,0 +1,541 @@
|
|
1
|
+
module Constants
  # Stop-word lists and tokenization constants shared by the categorizer.
  module Words
    # only include words > 2 chars
    # NOTE(review): %w splits on whitespace, so the entries originally written
    # as "days ago" / "hours ago" contribute the separate tokens 'days',
    # 'ago' and 'hours' — phrase stop-words cannot be expressed in %w.
    ENGLISH = %w(
      000 page home free also about above according
      accordingly across after afterward afterwards again against all
      almost alone along already also although always among
      amongst amp and another any anyhow anyone anything
      anywhere apr are aug around became because become
      becomes becoming been before beforehand began behind being
      below beside besides between beyond both but can
      cannot certain com could days ago dec did
      does down during each edu either else elsewhere
      enough especially est etc even ever every everyone
      everything everywhere example except feb few fewer finally
      find following for former formerly from further furthermore
      generally get given had has have having hence
      henceforth her here hereafter hereby herein hereupon hers
      herself him himself his hours ago how however
      http inc include included includes including indeed instead
      into its itself jan jul know known later
      latterly ldquo llc lquo least less many mar
      may maybe mdash meanwhile might miss more moreover
      most mostly much must myself nbsp ndash near
      nearly neither never nevertheless next nobody non none
      nonetheless nor not nothing nov now nowhere oct
      off often once one only onto org other
      others otherwise our ours ourselves out over overall
      own part particularly parts per perhaps probably quot
      rather rdquo rquo said same seem seemed seeming
      seemingly seems sep set several she should similar
      since site some somehow someone something sometime sometimes
      somewhat somewhere still such than that the their
      them themselves then thence thenceforth there thereafter thereby
      therefore therein thereupon these they this those though
      through throughout thru thus together too took toward
      towards two under unless unlike unlikely until upon
      url use used using usually various very via
      want was way well were what whatever when
      whence whenever where whereafter whereas whereby wherein whereupon
      wherever whether which while whither who whoever whole
      whom whomever whose why will with within without
      would www yes yet you your yours yourself
      yourselves
    )
    SPANISH = %w(
      acuerdo adelante ademas adrede ahi ahora alli alrededor
      antano ante antes apenas aproximadamente aquel aquella aquellas
      aquello aquellos aqui arribaabajo asi aun aunque bajo
      bastante bien breve casi cerca claro como con
      conmigo contigo contra cual cuales cuando cuanta cuantas
      cuanto cuantos debajo del delante demasiado dentro deprisa
      desde despacio despues detras dia dias donde dos
      durante ella ellas ellos encima enfrente enseguida entre
      esa esas ese eso esos esta estado estados
      estan estar estas este esto estos excepto final
      fue fuera fueron general gran habia habla hablan
      hace hacia han hasta hay horas hoy incluso
      informo junto lado las lejos los luego mal
      mas mayor medio mejor menos menudo mia mias
      mientras mio mios mis mismo mucho muy nada
      nadie ninguna nos nosotras nosotros nuestra nuestras nuestro
      nuestros nueva nuevo nunca otra otros pais para
      parte pasado peor pero poco por porque pronto
      proximo puede qeu que quien quienes quiza quizas
      raras repente salvo segun ser sera sido siempre
      sin sobre solamente solo son soyos supuesto sus
      suya suyas suyo tal tambien tampoco tarde temprano
      tiene todavia todo todos tras tus tuya tuyas
      tuyo tuyos una unas uno unos usted ustedes
      veces vez vosotras vosotros vuestra vuestras vuestro vuestros
      tudo dise dicas muito
    )
    FRENCH = %w(
      des les mais pour
    )
    # Array#| unions the three lists, de-duplicating repeated entries
    # (e.g. 'also' appears twice in ENGLISH but once in COMMON).
    COMMON = ENGLISH | SPANISH | FRENCH
    # Octal-escaped UTF-8 byte sequences for CJK separator punctuation
    # (e.g. \302\267 is the UTF-8 encoding of U+00B7 MIDDLE DOT); the
    # escapes are resolved by the regexp engine, not by the string literal.
    ASIAN_SPACE_CHARS = [
      '\302\267',
      '\343\200\201',
      '\343\200\202',
      '\343\203\273',
      '\357\274\201'
    ].join('|')
    # Split on any non-word character, any punctuation, or one of the
    # CJK separators above.
    SPLIT_REGEX_STR = '[^[:word:]]|[[:punct:]]|' + ASIAN_SPACE_CHARS
    SPLIT_REGEX = Regexp.new(SPLIT_REGEX_STR.force_encoding('utf-8'))
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
|
2
|
+
|
3
|
+
# Bag-of-words topic model: clusters gram collections into buckets keyed by
# their best-fitting frequent gram.
class BagOfWords
  include ::Utils::Grams

  # DEBUG = false
  # TODO: some gradient descent to choose this number
  # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
  MIN_SUPP_L = 0.07
  MIN_SUPP_H = 0.1
  # Only the NUM_TOP_GRAMS most frequent grams are considered, for speed.
  NUM_TOP_GRAMS = 250
  # Maximum number of output buckets (candidate category grams).
  MAX_BUCKETS = 8

  # Build the model: returns Hash of bucket gram (or nil) => list of
  # gram_collection contents assigned to that bucket.
  #
  # query:: the (normalized) query string
  # records_to_tokens:: Hash of record => token list, as produced upstream
  #
  # function worst case
  # O(2 x (#frequent_grams x #gram_collections) + #all_grams + MAX_BUCKETS x #gram_collections)
  def model(query, records_to_tokens)
    @gram_cover_cache = {}
    @gram_collections, @all_grams = create_grams(query, records_to_tokens)

    top_grams = determine_frequency_term_sets(@all_grams, query)
    # NOTE(review): this sorts ascending by vote count and keeps the FIRST
    # MAX_BUCKETS keys, i.e. the least-supported grams become buckets —
    # confirm this is intended (descending would pick the most common).
    top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
      top_grams[gram_c1] <=> top_grams[gram_c2]
    end.first(MAX_BUCKETS)

    # below block, worst case O(MAX_BUCKETS x #gram_collections)
    # Assign each collection to the bucket gram with the highest fitness;
    # collections matching no bucket fall under the nil key.
    @gram_collections.inject({}) do |buckets, gram_collection|
      max_fitness = 0
      max_fit = nil
      top_grams.each do |top_gram|
        # the >= removes the 'none' possibility
        if gram_collection.fitness[top_gram] && gram_collection.fitness[top_gram] >= max_fitness
          max_fitness = gram_collection.fitness[top_gram]
          max_fit = top_gram
        end
      end
      buckets[max_fit] ||= []
      buckets[max_fit] << gram_collection.content
      buckets
    end
  end

  # Score frequent grams: each collection votes for the gram it fits best.
  #
  # ==== Return
  # Hash - fitness => [gram_collection, ...]
  # function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
  def determine_frequency_term_sets(all_grams, query)
    # only count a result if it has non-0 words length
    effective_length = @gram_collections.reject do |result|
      result.grams.nil? || result.grams.empty?
    end.length

    # Minimum number of collections a gram must appear in to be kept.
    min_cover_l = MIN_SUPP_L * effective_length
    # min_cover_h = MIN_SUPP_H * effective_length

    # for speed only look at top N grams
    # below block, worst case O(#all_grams)
    frequent_grams = all_grams.sort do |gram1, gram2|
      gram2.frequency <=> gram1.frequency
    end.first(NUM_TOP_GRAMS)

    # below block, worst case O(#frequent_grams x #gram_collections)
    frequent_grams = frequent_grams.delete_if do |gram|
      !cover(gram, min_cover_l)
    end

    # below block, worst case O(#frequent_grams x #gram_collections)
    # Each collection casts one vote for its best-fitting gram; fitness is
    # the share of the gram's global frequency contributed by the collection.
    @gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
      max_fitness = 0
      max_fit = nil

      frequent_grams.each do |gram|
        fitness = gram_collection.fitness[gram.content] = (gram_collection.content_to_frequency[gram.content] || 0) / gram.frequency.to_f
        if fitness > max_fitness
          max_fitness = fitness
          max_fit = gram.content
        end
      end

      # puts "#{max_fit}: #{max_fitness}"# if DEBUG
      top_grams[max_fit] += 1 if max_fit
      top_grams
    end
  end

  # True if `gram` occurs in at least `min_length` collections; memoized in
  # @gram_cover_cache (false results are cached too, hence the explicit
  # nil check rather than a truthiness test).
  # function worstcase O(#gram_collections)
  def cover(gram, min_length)
    ((cached = @gram_cover_cache[gram]) != nil) and return cached
    count = 0
    @gram_collections.each do |gram_collection|
      frequency = gram_collection.content_to_frequency[gram.content]
      if !frequency.nil? && frequency > 0
        count += 1
        # Early exit once the threshold is met.
        return @gram_cover_cache[gram] = true if count >= min_length
      end
    end
    @gram_cover_cache[gram] = false
  end
end
|
data/lib/utils/grams.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'gram_collection')
|
2
|
+
require File.join(File.dirname(__FILE__), 'gram_node')
|
3
|
+
|
4
|
+
module Utils
  # Helpers for building and normalizing gram statistics from tokenized
  # records.
  module Grams
    # Build a GramCollection per record and gather every gram seen.
    #
    # query:: the raw query string
    # records_to_words:: Hash/Enumerable of record => token list
    #
    # Returns [gram_collections, unique_grams] where unique_grams has
    # per-content frequencies merged.
    def create_grams(query, records_to_words)
      all_grams = []
      @query = query
      @query_terms = query.split.map(&:downcase).map(&:strip)
      # BUG FIX: the original interpolated the Array itself
      # ("#{@query_terms[1..-1]} #{@query_terms[0]}"), yielding a string
      # like '["bar"] foo' instead of the rotated query 'bar foo'.
      @query_alt = "#{@query_terms[1..-1].join(' ')} #{@query_terms[0]}"

      invalid = Proc.new do |gram, *args|
        # remove [[gram]] if == [[query]]
        gram == @query || gram == @query_alt || @query_terms.include?(gram)
      end

      gram_collections = records_to_words.map do |record, words|
        gram_collection = GramCollection.new(record, words, invalid)
        all_grams += gram_collection.grams
        gram_collection
      end
      return gram_collections, make_grams_unique(all_grams)
    end

    # if exists [[gram]] and [[gram]]s then remove [[gram]]s
    # Mutates and returns frequent_grams.
    def check_plurals(frequent_grams)
      frequent_grams_contents = frequent_grams.map(&:content)
      frequent_grams.delete_if do |gram|
        # && instead of the low-precedence `and` of the original; same result.
        gram.content[-1] == 's' &&
          frequent_grams_contents.include?(gram.content[0...-1])
      end
    end

    # Merge grams sharing the same content, summing their frequencies;
    # the first gram seen for a content wins as the surviving object.
    def make_grams_unique(grams)
      grams.inject({}) do |hash, gram|
        if hash[gram.content]
          hash[gram.content].frequency += gram.frequency
        else
          hash[gram.content] = gram
        end
        hash
      end.values
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: categorize
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Peter Lubell-Doughtie
|
9
|
+
- Helioid Inc.
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-06-28 00:00:00.000000000 Z
|
14
|
+
dependencies: []
|
15
|
+
description: Text categorization library
|
16
|
+
email: peter@helioid.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/categorize.rb
|
22
|
+
- lib/constants.rb
|
23
|
+
- lib/models/bag_of_words.rb
|
24
|
+
- lib/utils/grams.rb
|
25
|
+
homepage: http://www.helioid.com/
|
26
|
+
licenses: []
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
none: false
|
33
|
+
requirements:
|
34
|
+
- - ! '>='
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ! '>='
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
requirements: []
|
44
|
+
rubyforge_project:
|
45
|
+
rubygems_version: 1.8.25
|
46
|
+
signing_key:
|
47
|
+
specification_version: 3
|
48
|
+
summary: Text categorization library
|
49
|
+
test_files: []
|