categorize 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/categorize.rb ADDED
@@ -0,0 +1,49 @@
1
require File.join(File.dirname(__FILE__), 'models', 'bag_of_words')
require File.join(File.dirname(__FILE__), 'constants')

# Categorize builds topic models over collections of text documents.
module Categorize
  # Tokens shorter than this many characters are dropped by #preprocess.
  MIN_WORD_LENGTH = 3
  # Default topic model shared by all calls to .make_model.
  @bag_of_words = BagOfWords.new

  class << self
    #include Bow
    # Builds a topic model for +documents+ relative to +query+.
    #
    # ==== Return
    # Hash - category => results
    # ==== Parameters
    # query:: the query string the documents relate to
    # documents:: a list of documents (strings) to be classified
    # topic_model:: any object responding to #model(query, records_to_tokens);
    #               defaults to the shared BagOfWords instance
    def make_model(query, documents, topic_model = @bag_of_words)
      records_to_tokens = lexicalize(documents)
      topic_model.model(query.downcase.strip, records_to_tokens)
    end

    # Maps each distinct token to the 1-based positions at which it occurs
    # in the concatenated token stream produced from +strings+.
    #
    # ==== Return
    # Hash - token => [position, ...]
    # ==== Parameters
    # strings:: the strings to be tokenized and indexed
    def make_model_c(strings)
      # BUG FIX: the original discarded the result of the preprocess map and
      # then referenced an undefined local `ret` (its assignment,
      # `ret = model_bow(array_of_tokens)`, was commented out), so this method
      # always raised NameError. Build the token stream explicitly instead.
      tokens = strings.flat_map { |s| preprocess(s) }
      count = 0
      tokens.each_with_object({}) do |term, hash|
        hash[term] ||= []
        hash[term] << (count += 1)
      end
    end

    private

    # ==== Return
    # Hash - record index (0-based) => token list for the matching string
    def lexicalize(strings)
      Hash[
        (0..(strings.length - 1)).zip(strings.map { |s| preprocess(s) })
      ]
    end

    # Splits a raw string on the tokenizer regex, downcases every token, and
    # drops tokens that are too short or are known stop words.
    def preprocess(string)
      string.split(Constants::Words::SPLIT_REGEX).map(&:downcase).delete_if do |word|
        word.length < MIN_WORD_LENGTH ||
          Constants::Words::COMMON.include?(word)
      end
    end
  end
end
data/lib/constants.rb ADDED
@@ -0,0 +1,541 @@
1
# Stop-word lists and the tokenizer regex used by Categorize.preprocess.
module Constants
  module Words
    # only include words > 2 chars
    # NOTE: %w() splits on whitespace, so the original multi-word entries
    # "days ago" / "hours ago" contribute the separate tokens days/hours/ago.
    # .uniq removes the accidental duplicates ("also", "ago") present in the
    # original list; order is preserved.
    ENGLISH = %w(
      000 page home free also about above according accordingly across
      after afterward afterwards again against all almost alone along
      already also although always among amongst amp and another any
      anyhow anyone anything anywhere apr are aug around became because
      become becomes becoming been before beforehand began behind being
      below beside besides between beyond both but can cannot certain com
      could days ago dec did does down during each edu either else
      elsewhere enough especially est etc even ever every everyone
      everything everywhere example except feb few fewer finally find
      following for former formerly from further furthermore generally
      get given had has have having hence henceforth her here hereafter
      hereby herein hereupon hers herself him himself his hours ago how
      however http inc include included includes including indeed instead
      into its itself jan jul know known later latterly ldquo llc lquo
      least less many mar may maybe mdash meanwhile might miss more
      moreover most mostly much must myself nbsp ndash near nearly
      neither never nevertheless next nobody non none nonetheless nor not
      nothing nov now nowhere oct off often once one only onto org other
      others otherwise our ours ourselves out over overall own part
      particularly parts per perhaps probably quot rather rdquo rquo said
      same seem seemed seeming seemingly seems sep set several she should
      similar since site some somehow someone something sometime
      sometimes somewhat somewhere still such than that the their them
      themselves then thence thenceforth there thereafter thereby
      therefore therein thereupon these they this those though through
      throughout thru thus together too took toward towards two under
      unless unlike unlikely until upon url use used using usually
      various very via want was way well were what whatever when whence
      whenever where whereafter whereas whereby wherein whereupon
      wherever whether which while whither who whoever whole whom
      whomever whose why will with within without would www yes yet you
      your yours yourself yourselves
    ).uniq.freeze
    # NOTE: the tail of this list (tudo, dise, dicas, muito) is Portuguese,
    # kept for behavioral compatibility with the original.
    SPANISH = %w(
      acuerdo adelante ademas adrede ahi ahora alli alrededor antano ante
      antes apenas aproximadamente aquel aquella aquellas aquello
      aquellos aqui arribaabajo asi aun aunque bajo bastante bien breve
      casi cerca claro como con conmigo contigo contra cual cuales cuando
      cuanta cuantas cuanto cuantos debajo del delante demasiado dentro
      deprisa desde despacio despues detras dia dias donde dos durante
      ella ellas ellos encima enfrente enseguida entre esa esas ese eso
      esos esta estado estados estan estar estas este esto estos excepto
      final fue fuera fueron general gran habia habla hablan hace hacia
      han hasta hay horas hoy incluso informo junto lado las lejos los
      luego mal mas mayor medio mejor menos menudo mia mias mientras mio
      mios mis mismo mucho muy nada nadie ninguna nos nosotras nosotros
      nuestra nuestras nuestro nuestros nueva nuevo nunca otra otros pais
      para parte pasado peor pero poco por porque pronto proximo puede
      qeu que quien quienes quiza quizas raras repente salvo segun ser
      sera sido siempre sin sobre solamente solo son soyos supuesto sus
      suya suyas suyo tal tambien tampoco tarde temprano tiene todavia
      todo todos tras tus tuya tuyas tuyo tuyos una unas uno unos usted
      ustedes veces vez vosotras vosotros vuestra vuestras vuestro
      vuestros tudo dise dicas muito
    ).freeze
    FRENCH = %w(
      des les mais pour
    ).freeze
    # Union of all stop-word lists (| also deduplicates).
    COMMON = (ENGLISH | SPANISH | FRENCH).freeze
    # BUG FIX: the original used single-quoted octal byte escapes such as
    # '\302\267'; in a Ruby regex those compile to the two characters
    # U+00C2 U+00B7 rather than the single intended UTF-8 character, so the
    # separators were never matched. Use explicit code-point literals.
    ASIAN_SPACE_CHARS = [
      "\u00B7", # middle dot
      "\u3001", # ideographic comma
      "\u3002", # ideographic full stop
      "\u30FB", # katakana middle dot
      "\uFF01"  # fullwidth exclamation mark
    ].join('|').freeze
    # Matches any non-word character, any punctuation, or any of the Asian
    # separator characters above.
    SPLIT_REGEX_STR = ('[^[:word:]]|[[:punct:]]|' + ASIAN_SPACE_CHARS).freeze
    SPLIT_REGEX = Regexp.new(SPLIT_REGEX_STR)
  end
end
@@ -0,0 +1,97 @@
1
+ require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
2
+
3
# Bag-of-words topic model. #model groups each document (a GramCollection)
# under the single "bucket" gram it fits best.
class BagOfWords
  include ::Utils::Grams

  # DEBUG = false
  # TODO: some gradient descent to choose this number
  # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
  MIN_SUPP_L = 0.07
  MIN_SUPP_H = 0.1
  # For speed, only the most frequent NUM_TOP_GRAMS grams are considered.
  NUM_TOP_GRAMS = 250
  # Maximum number of result buckets.
  MAX_BUCKETS = 8

  # Builds the model: tokenized records become gram collections, bucket
  # grams are selected, and each record's content is filed under the bucket
  # gram it fits best (nil key collects records that fit no bucket).
  # Returns Hash - bucket gram content (or nil) => [record content, ...]
  #
  # function worst case
  # O(2 x (#frequent_grams x #gram_collections) + #all_grams + MAX_BUCKETS x #gram_collections)
  def model(query, records_to_tokens)
    @gram_cover_cache = {}
    @gram_collections, @all_grams = create_grams(query, records_to_tokens)

    top_grams = determine_frequency_term_sets(@all_grams, query)
    # NOTE(review): this sorts candidate grams by ASCENDING best-fit count and
    # keeps the first MAX_BUCKETS, i.e. the least-often-winning candidates
    # become buckets. Confirm this is intentional — descending order may have
    # been intended.
    top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
      top_grams[gram_c1] <=> top_grams[gram_c2]
    end.first(MAX_BUCKETS)

    # below block, worst case O(MAX_BUCKETS x #gram_collections)
    @gram_collections.inject({}) do |buckets, gram_collection|
      max_fitness = 0
      max_fit = nil
      top_grams.each do |top_gram|
        # the >= removes the 'none' possibility
        if gram_collection.fitness[top_gram] && gram_collection.fitness[top_gram] >= max_fitness
          max_fitness = gram_collection.fitness[top_gram]
          max_fit = top_gram
        end
      end
      buckets[max_fit] ||= []
      buckets[max_fit] << gram_collection.content
      buckets
    end
  end

  # ==== Return
  # Hash - fitness => [gram_collection, ...]
  # NOTE(review): the comment above is inaccurate — the inject below actually
  # returns Hash - gram content => number of collections for which that gram
  # was the best fit. As a side effect it also populates each collection's
  # fitness map (gram content => collection frequency / corpus frequency).
  # function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
  def determine_frequency_term_sets(all_grams, query)
    # only count a result if it has non-0 words length
    effective_length = @gram_collections.reject do |result|
      result.grams.nil? || result.grams.empty?
    end.length

    # minimum number of collections a gram must appear in to be kept
    min_cover_l = MIN_SUPP_L * effective_length
    # min_cover_h = MIN_SUPP_H * effective_length

    # for speed only look at top N grams
    # below block, worst case O(#all_grams)
    frequent_grams = all_grams.sort do |gram1, gram2|
      gram2.frequency <=> gram1.frequency
    end.first(NUM_TOP_GRAMS)

    # drop grams whose coverage falls below the support threshold
    # below block, worst case O(#frequent_grams x #gram_collections)
    frequent_grams = frequent_grams.delete_if do |gram|
      !cover(gram, min_cover_l)
    end

    # below block, worst case O(#frequent_grams x #gram_collections)
    @gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
      max_fitness = 0
      max_fit = nil

      frequent_grams.each do |gram|
        # fitness = fraction of the gram's corpus frequency contributed by
        # this collection; stored on the collection for later bucketing
        fitness = gram_collection.fitness[gram.content] = (gram_collection.content_to_frequency[gram.content] || 0) / gram.frequency.to_f
        if fitness > max_fitness
          max_fitness = fitness
          max_fit = gram.content
        end
      end

      # puts "#{max_fit}: #{max_fitness}"# if DEBUG
      top_grams[max_fit] += 1 if max_fit
      top_grams
    end
  end

  # True when +gram+ occurs (frequency > 0) in at least +min_length+
  # collections; results are memoized in @gram_cover_cache.
  # function worstcase O(#gram_collections)
  def cover(gram, min_length)
    # return the memoized answer if we have one (may be false, so test nil)
    ((cached = @gram_cover_cache[gram]) != nil) and return cached
    count = 0
    @gram_collections.each do |gram_collection|
      frequency = gram_collection.content_to_frequency[gram.content]
      if !frequency.nil? && frequency > 0
        count += 1
        # early exit as soon as the threshold is reached
        return @gram_cover_cache[gram] = true if count >= min_length
      end
    end
    @gram_cover_cache[gram] = false
  end
end
@@ -0,0 +1,45 @@
1
+ require File.join(File.dirname(__FILE__), 'gram_collection')
2
+ require File.join(File.dirname(__FILE__), 'gram_node')
3
+
4
module Utils
  # Mixin with helpers for building and normalizing grams (token n-grams)
  # extracted from tokenized records.
  module Grams
    # Builds a GramCollection per record and gathers every produced gram.
    #
    # ==== Return
    # [gram_collections, unique_all_grams] - the per-record collections and
    # the deduplicated list of all grams (frequencies merged)
    # ==== Parameters
    # query:: the raw query string
    # records_to_words:: Hash - record => [token, ...]
    def create_grams(query, records_to_words)
      all_grams = []
      @query = query
      @query_terms = query.split.map(&:downcase).map(&:strip)
      # BUG FIX: the original interpolated the Array slice directly
      # ("#{@query_terms[1..-1]} ..."), yielding strings like '["bar"] foo';
      # join the terms so the rotated query reads 'bar foo'.
      @query_alt = "#{@query_terms[1..-1].join(' ')} #{@query_terms[0]}"

      # A gram is invalid when it merely restates the query (exactly, as the
      # rotated form, or as one of its individual terms).
      invalid = Proc.new do |gram, *args|
        # remove [[gram]] if == [[query]]
        gram == @query || gram == @query_alt || @query_terms.include?(gram)
      end

      gram_collections = records_to_words.map do |record, words|
        gram_collection = GramCollection.new(record, words, invalid)
        all_grams += gram_collection.grams
        gram_collection
      end
      return gram_collections, make_grams_unique(all_grams)
    end

    # if exists [[gram]] and [[gram]]s then remove [[gram]]s
    # Mutates and returns +frequent_grams+.
    def check_plurals(frequent_grams)
      frequent_grams_contents = frequent_grams.map(&:content)
      frequent_grams.delete_if do |gram|
        gram.content[-1] == 's' and
          frequent_grams_contents.include?(gram.content[0...-1])
      end
    end

    # Merges grams with identical content, summing their frequencies, and
    # returns the deduplicated gram list.
    def make_grams_unique(grams)
      grams.inject({}) do |hash, gram|
        if hash[gram.content]
          hash[gram.content].frequency += gram.frequency
        else
          hash[gram.content] = gram
        end
        hash
      end.values
    end
  end
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: categorize
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Peter Lubell-Doughtie
9
+ - Helioid Inc.
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-06-28 00:00:00.000000000 Z
14
+ dependencies: []
15
+ description: Text categorization library
16
+ email: peter@helioid.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/categorize.rb
22
+ - lib/constants.rb
23
+ - lib/models/bag_of_words.rb
24
+ - lib/utils/grams.rb
25
+ homepage: http://www.helioid.com/
26
+ licenses: []
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ none: false
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubyforge_project:
45
+ rubygems_version: 1.8.25
46
+ signing_key:
47
+ specification_version: 3
48
+ summary: Text categorization library
49
+ test_files: []