categorize 0.0.1

data/lib/categorize.rb ADDED
@@ -0,0 +1,49 @@
+ require File.join(File.dirname(__FILE__), 'models', 'bag_of_words')
+ require File.join(File.dirname(__FILE__), 'constants')
+
+ module Categorize
+   MIN_WORD_LENGTH = 3
+   @bag_of_words = BagOfWords.new
+
+   class << self
+     # ==== Return
+     # Hash - category => results
+     # ==== Parameters
+     # query:: the query the documents were retrieved for
+     # documents:: a list of documents to be classified
+     def make_model(query, documents, topic_model = @bag_of_words)
+       records_to_tokens = lexicalize(documents)
+       topic_model.model(query.downcase.strip, records_to_tokens)
+     end
+
+     # ==== Return
+     # Hash - term => 1-indexed token positions at which the term was seen
+     # ==== Parameters
+     # strings:: the strings to be classified
+     def make_model_c(strings)
+       # fall back to Ruby tokenization; model_bow (below) refers to a
+       # C extension that is not included in this gem, so without this
+       # assignment `ret` would be undefined
+       ret = strings.flat_map { |s| preprocess(s) }
+       # ret = model_bow(array_of_tokens)
+       count = 0
+       ret.inject({}) do |hash, term|
+         hash[term] ||= []
+         hash[term] << count += 1
+         hash
+       end
+     end
+
+     private
+
+     def lexicalize(strings)
+       Hash[(0...strings.length).zip(strings.map { |s| preprocess(s) })]
+     end
+
+     def preprocess(string)
+       string.split(Constants::Words::SPLIT_REGEX).map(&:downcase).delete_if do |word|
+         word.length < MIN_WORD_LENGTH ||
+           Constants::Words::COMMON.include?(word)
+       end
+     end
+   end
+ end
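
A minimal usage sketch of the public entry point above. The document strings and the exact buckets shown in the trailing comment are illustrative assumptions, not output taken from the gem:

    require 'categorize'

    docs = [
      'ruby is a dynamic programming language',
      'rails is a web framework written in ruby',
      'python is another dynamic language'
    ]
    buckets = Categorize.make_model('ruby', docs)
    # buckets maps each winning gram to the indices of the documents it
    # best fits, e.g. { 'dynamic' => [0, 2], 'framework' => [1] }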
data/lib/constants.rb ADDED
@@ -0,0 +1,541 @@
+ module Constants
+   module Words
+     # only include words > 2 chars
+     # NB: %w splits on whitespace, so two-word entries such as
+     # "days ago" and "hours ago" end up as separate tokens
+     ENGLISH = %w(
+       000 page home free also about above according accordingly across
+       after afterward afterwards again against all almost alone along
+       already also although always among amongst amp and another any
+       anyhow anyone anything anywhere apr are aug around became because
+       become becomes becoming been before beforehand began behind being
+       below beside besides between beyond both but can cannot certain
+       com could days ago dec did does down during each edu either else
+       elsewhere enough especially est etc even ever every everyone
+       everything everywhere example except feb few fewer finally find
+       following for former formerly from further furthermore generally
+       get given had has have having hence henceforth her here hereafter
+       hereby herein hereupon hers herself him himself his hours ago how
+       however http inc include included includes including indeed
+       instead into its itself jan jul know known later latterly ldquo
+       llc lquo least less many mar may maybe mdash meanwhile might miss
+       more moreover most mostly much must myself nbsp ndash near nearly
+       neither never nevertheless next nobody non none nonetheless nor
+       not nothing nov now nowhere oct off often once one only onto org
+       other others otherwise our ours ourselves out over overall own
+       part particularly parts per perhaps probably quot rather rdquo
+       rquo said same seem seemed seeming seemingly seems sep set
+       several she should similar since site some somehow someone
+       something sometime sometimes somewhat somewhere still such than
+       that the their them themselves then thence thenceforth there
+       thereafter thereby therefore therein thereupon these they this
+       those though through throughout thru thus together too took
+       toward towards two under unless unlike unlikely until upon url
+       use used using usually various very via want was way well were
+       what whatever when whence whenever where whereafter whereas
+       whereby wherein whereupon wherever whether which while whither
+       who whoever whole whom whomever whose why will with within
+       without would www yes yet you your yours yourself yourselves
+     )
+     SPANISH = %w(
+       acuerdo adelante ademas adrede ahi ahora alli alrededor antano
+       ante antes apenas aproximadamente aquel aquella aquellas aquello
+       aquellos aqui arribaabajo asi aun aunque bajo bastante bien breve
+       casi cerca claro como con conmigo contigo contra cual cuales
+       cuando cuanta cuantas cuanto cuantos debajo del delante demasiado
+       dentro deprisa desde despacio despues detras dia dias donde dos
+       durante ella ellas ellos encima enfrente enseguida entre esa esas
+       ese eso esos esta estado estados estan estar estas este esto
+       estos excepto final fue fuera fueron general gran habia habla
+       hablan hace hacia han hasta hay horas hoy incluso informo junto
+       lado las lejos los luego mal mas mayor medio mejor menos menudo
+       mia mias mientras mio mios mis mismo mucho muy nada nadie ninguna
+       nos nosotras nosotros nuestra nuestras nuestro nuestros nueva
+       nuevo nunca otra otros pais para parte pasado peor pero poco por
+       porque pronto proximo puede qeu que quien quienes quiza quizas
+       raras repente salvo segun ser sera sido siempre sin sobre
+       solamente solo son soyos supuesto sus suya suyas suyo tal tambien
+       tampoco tarde temprano tiene todavia todo todos tras tus tuya
+       tuyas tuyo tuyos una unas uno unos usted ustedes veces vez
+       vosotras vosotros vuestra vuestras vuestro vuestros tudo dise
+       dicas muito
+     )
+     FRENCH = %w(des les mais pour)
+     COMMON = ENGLISH | SPANISH | FRENCH
+     # UTF-8 byte sequences, written as regexp octal escapes, for CJK
+     # punctuation treated as token separators
+     ASIAN_SPACE_CHARS = [
+       '\302\267',      # U+00B7 middle dot
+       '\343\200\201',  # U+3001 ideographic comma
+       '\343\200\202',  # U+3002 ideographic full stop
+       '\343\203\273',  # U+30FB katakana middle dot
+       '\357\274\201'   # U+FF01 fullwidth exclamation mark
+     ].join('|')
+     SPLIT_REGEX_STR = '[^[:word:]]|[[:punct:]]|' + ASIAN_SPACE_CHARS
+     SPLIT_REGEX = Regexp.new(SPLIT_REGEX_STR.force_encoding('utf-8'))
+   end
+ end
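
A quick sketch of how these constants drive tokenization; this mirrors the private Categorize.preprocess shown earlier (the sample sentence is invented, and the script is assumed to run from the gem's lib directory):

    require File.join(File.dirname(__FILE__), 'constants')

    text = 'The quick, brown fox jumps over the lazy dog!'
    tokens = text.split(Constants::Words::SPLIT_REGEX)
                 .map(&:downcase)
                 .reject { |w| w.length < 3 || Constants::Words::COMMON.include?(w) }
    # => ["quick", "brown", "fox", "jumps", "lazy", "dog"]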
data/lib/models/bag_of_words.rb ADDED
@@ -0,0 +1,97 @@
+ require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
+
+ class BagOfWords
+   include ::Utils::Grams
+
+   # TODO: some gradient descent to choose this number
+   # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
+   MIN_SUPP_L = 0.07
+   MIN_SUPP_H = 0.1
+   NUM_TOP_GRAMS = 250
+   MAX_BUCKETS = 8
+
+   # ==== Return
+   # Hash - gram => [record, ...] for the records it fits best
+   # function worst case
+   # O(2 x (#frequent_grams x #gram_collections) + #all_grams +
+   #   MAX_BUCKETS x #gram_collections)
+   def model(query, records_to_tokens)
+     @gram_cover_cache = {}
+     @gram_collections, @all_grams = create_grams(query, records_to_tokens)
+
+     top_grams = determine_frequency_term_sets(@all_grams, query)
+     top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
+       top_grams[gram_c1] <=> top_grams[gram_c2]
+     end.first(MAX_BUCKETS)
+
+     # below block, worst case O(MAX_BUCKETS x #gram_collections)
+     @gram_collections.inject({}) do |buckets, gram_collection|
+       max_fitness = 0
+       max_fit = nil
+       top_grams.each do |top_gram|
+         # the >= removes the 'none' possibility
+         if gram_collection.fitness[top_gram] &&
+            gram_collection.fitness[top_gram] >= max_fitness
+           max_fitness = gram_collection.fitness[top_gram]
+           max_fit = top_gram
+         end
+       end
+       buckets[max_fit] ||= []
+       buckets[max_fit] << gram_collection.content
+       buckets
+     end
+   end
+
+   # ==== Return
+   # Hash - gram content => number of collections for which it is the best fit
+   # function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
+   def determine_frequency_term_sets(all_grams, query)
+     # only count a result if it has non-0 words length
+     effective_length = @gram_collections.reject do |result|
+       result.grams.nil? || result.grams.empty?
+     end.length
+
+     min_cover_l = MIN_SUPP_L * effective_length
+
+     # for speed only look at the top N grams by frequency
+     frequent_grams = all_grams.sort do |gram1, gram2|
+       gram2.frequency <=> gram1.frequency
+     end.first(NUM_TOP_GRAMS)
+
+     # below block, worst case O(#frequent_grams x #gram_collections)
+     frequent_grams = frequent_grams.delete_if do |gram|
+       !cover(gram, min_cover_l)
+     end
+
+     # below block, worst case O(#frequent_grams x #gram_collections)
+     @gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
+       max_fitness = 0
+       max_fit = nil
+
+       frequent_grams.each do |gram|
+         fitness = gram_collection.fitness[gram.content] =
+           (gram_collection.content_to_frequency[gram.content] || 0) /
+           gram.frequency.to_f
+         if fitness > max_fitness
+           max_fitness = fitness
+           max_fit = gram.content
+         end
+       end
+
+       top_grams[max_fit] += 1 if max_fit
+       top_grams
+     end
+   end
+
+   # function worst case O(#gram_collections)
+   def cover(gram, min_length)
+     cached = @gram_cover_cache[gram]
+     return cached unless cached.nil?
+     count = 0
+     @gram_collections.each do |gram_collection|
+       frequency = gram_collection.content_to_frequency[gram.content]
+       if !frequency.nil? && frequency > 0
+         count += 1
+         return @gram_cover_cache[gram] = true if count >= min_length
+       end
+     end
+     @gram_cover_cache[gram] = false
+   end
+ end
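
The fitness score computed above is the fraction of a gram's corpus-wide frequency that falls inside a single collection, so it always lies in [0, 1]. A toy calculation with invented numbers:

    local_frequency  = 3     # 'ruby' occurs 3 times in this record
    global_frequency = 12    # 'ruby' occurs 12 times across all records
    fitness = local_frequency / global_frequency.to_f
    # => 0.25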
data/lib/utils/grams.rb ADDED
@@ -0,0 +1,45 @@
+ require File.join(File.dirname(__FILE__), 'gram_collection')
+ require File.join(File.dirname(__FILE__), 'gram_node')
+
+ module Utils
+   module Grams
+     def create_grams(query, records_to_words)
+       all_grams = []
+       @query = query
+       @query_terms = query.split.map(&:downcase).map(&:strip)
+       # the query with its first term rotated to the end,
+       # e.g. "ruby gem" => "gem ruby" (joined so the Array is not
+       # interpolated in its inspect form)
+       @query_alt = "#{@query_terms[1..-1].join(' ')} #{@query_terms[0]}"
+
+       invalid = Proc.new do |gram, *args|
+         # remove [[gram]] if == [[query]]
+         gram == @query || gram == @query_alt || @query_terms.include?(gram)
+       end
+
+       gram_collections = records_to_words.map do |record, words|
+         gram_collection = GramCollection.new(record, words, invalid)
+         all_grams += gram_collection.grams
+         gram_collection
+       end
+       return gram_collections, make_grams_unique(all_grams)
+     end
+
+     def check_plurals(frequent_grams)
+       # if both [[gram]] and [[gram]]s exist, remove [[gram]]s
+       frequent_grams_contents = frequent_grams.map(&:content)
+       frequent_grams.delete_if do |gram|
+         gram.content[-1] == 's' &&
+           frequent_grams_contents.include?(gram.content[0...-1])
+       end
+     end
+
+     def make_grams_unique(grams)
+       grams.inject({}) do |hash, gram|
+         if hash[gram.content]
+           hash[gram.content].frequency += gram.frequency
+         else
+           hash[gram.content] = gram
+         end
+         hash
+       end.values
+     end
+   end
+ end
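
A self-contained illustration of make_grams_unique merging duplicate grams by summing their frequencies. GramNode's definition is not shown in this diff, so a Struct stands in for it:

    GramStub = Struct.new(:content, :frequency)
    grams = [GramStub.new('ruby', 2), GramStub.new('rails', 1),
             GramStub.new('ruby', 3)]
    merged = grams.inject({}) do |hash, gram|
      if hash[gram.content]
        hash[gram.content].frequency += gram.frequency
      else
        hash[gram.content] = gram
      end
      hash
    end.values
    merged.map { |g| [g.content, g.frequency] }
    # => [["ruby", 5], ["rails", 1]]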
metadata ADDED
@@ -0,0 +1,49 @@
+ --- !ruby/object:Gem::Specification
+ name: categorize
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - Peter Lubell-Doughtie
+ - Helioid Inc.
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-06-28 00:00:00.000000000 Z
+ dependencies: []
+ description: Text categorization library
+ email: peter@helioid.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/categorize.rb
+ - lib/constants.rb
+ - lib/models/bag_of_words.rb
+ - lib/utils/grams.rb
+ homepage: http://www.helioid.com/
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.25
+ signing_key:
+ specification_version: 3
+ summary: Text categorization library
+ test_files: []