categorize 0.0.1 → 0.0.2

data/lib/constants.rb → data/lib/categorize/constants.rb RENAMED
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 module Constants
   module Words
     # only include words > 2 chars
data/lib/categorize.rb → data/lib/categorize/model.rb RENAMED
@@ -1,9 +1,8 @@
-require File.join(File.dirname(__FILE__), 'models', 'bag_of_words')
-require File.join(File.dirname(__FILE__), 'constants')
+# encoding: utf-8

 module Categorize
   MIN_WORD_LENGTH = 3
-  @bag_of_words = BagOfWords.new
+  @bag_of_words = Models::BagOfWords.new

   class << self
     #include Bow
@@ -24,7 +23,7 @@ module Categorize
       strings.map { |s| preprocess(s) }
       #ret = model_bow(array_of_tokens);
       count = 0
-      ret.inject({}) do |hash, term|
+      ret.reduce({}) do |hash, term|
         hash[term] ||= []
         hash[term] << count += 1
         hash
@@ -39,8 +38,9 @@ module Categorize
     end

     def preprocess(string)
-      string.split(Constants::Words::SPLIT_REGEX).map(&:downcase).delete_if do
-        |word|
+      split_lower_strings = string.split(
+        Constants::Words::SPLIT_REGEX).map(&:downcase)
+      split_lower_strings.delete_if do |word|
       word.length < MIN_WORD_LENGTH ||
         Constants::Words::COMMON.include?(word)
     end
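
The block that switches from `inject` to `reduce` above (the two are aliases in Ruby, so the change is stylistic) builds a map from each term to the 1-based running positions at which it was counted. A minimal standalone sketch of that pattern, with hypothetical input not taken from the gem:

    count = 0
    positions = %w[ruby gem ruby text].reduce({}) do |hash, term|
      hash[term] ||= []
      hash[term] << count += 1  # record this term's running position
      hash
    end
    positions # => {"ruby"=>[1, 3], "gem"=>[2], "text"=>[4]}
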
data/lib/categorize/models/bag_of_words.rb ADDED
@@ -0,0 +1,112 @@
+# encoding: utf-8
+
+require 'categorize/utils/grams'
+
+module Categorize
+  module Models
+    class BagOfWords
+      include Utils::Grams
+
+      # DEBUG = false
+      # TODO: some gradient descent to choose this number
+      # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
+      MIN_SUPP_L = 0.07
+      MIN_SUPP_H = 0.1
+      NUM_TOP_GRAMS = 250
+      MAX_BUCKETS = 8
+
+      # function worst case
+      # O(2 x (|frequent_grams| x |gram_collections|) +
+      #   |all_grams| + MAX_BUCKETS x |gram_collections|)
+      def model(query, records_to_tokens)
+        @gram_cover_cache = {}
+        @gram_collections, @all_grams = create_grams(query, records_to_tokens)
+
+        top_grams = determine_frequency_term_sets(@all_grams, query)
+        top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
+          top_grams[gram_c1] <=> top_grams[gram_c2]
+        end.first(MAX_BUCKETS)
+
+        # below block, worst case O(MAX_BUCKETS x |gram_collections|)
+        @gram_collections.reduce({}) do |buckets, gram_collection|
+          max_fitness = 0
+          max_fit = nil
+          top_grams.each do |top_gram|
+            # the >= removes the 'none' possibility
+            if gram_collection.fitness[top_gram] &&
+               gram_collection.fitness[top_gram] >= max_fitness
+              max_fitness = gram_collection.fitness[top_gram]
+              max_fit = top_gram
+            end
+          end
+          buckets[max_fit] ||= []
+          buckets[max_fit] << gram_collection.content
+          buckets
+        end
+      end
+
+      # ==== Return
+      # Hash - fitness => [gram_collection, ...]
+      # function worst case O(2 x (|frequent_grams| x |gram_collections|) +
+      #   |all_grams|)
+      def determine_frequency_term_sets(all_grams, query)
+        # only count a result if it has non-0 words length
+        effective_length = @gram_collections.reject do |result|
+          result.grams.nil? || result.grams.empty?
+        end.length
+
+        min_cover_l = MIN_SUPP_L * effective_length
+        # min_cover_h = MIN_SUPP_H * effective_length
+
+        # for speed only look at top N grams
+        # below block, worst case O(|all_grams|)
+        frequent_grams = all_grams.sort do |gram1, gram2|
+          gram2.frequency <=> gram1.frequency
+        end.first(NUM_TOP_GRAMS)
+
+        # below block, worst case O(|frequent_grams| x |gram_collections|)
+        frequent_grams = frequent_grams.delete_if do |gram|
+          !cover(gram, min_cover_l)
+        end
+
+        # below block, worst case O(|frequent_grams| x |gram_collections|)
+        @gram_collections.reduce(Hash.new(0)) do |top_grams, gram_collection|
+          max_fitness = 0
+          max_fit = nil
+
+          frequent_grams.each do |gram|
+            content_frequency = (
+              gram_collection.content_to_frequency[gram.content] || 0)
+            fitness = content_frequency / gram.frequency.to_f
+            gram_collection.fitness[gram.content] = fitness
+
+            if fitness > max_fitness
+              max_fitness = fitness
+              max_fit = gram.content
+            end
+          end
+
+          # puts "#{max_fit}: #{max_fitness}"# if DEBUG
+          top_grams[max_fit] += 1 if max_fit
+          top_grams
+        end
+      end
+
+      # function worstcase O(#gram_collections)
+      def cover(gram, min_length)
+        ((cached = @gram_cover_cache[gram]) != nil) and return cached
+        count = 0
+
+        @gram_collections.each do |gram_collection|
+          frequency = gram_collection.content_to_frequency[gram.content]
+          if !frequency.nil? && frequency > 0
+            count += 1
+            return @gram_cover_cache[gram] = true if count >= min_length
+          end
+        end
+
+        @gram_cover_cache[gram] = false
+      end
+    end
+  end
+end
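
`BagOfWords#model` groups each record under the frequent gram that fits it best (or `nil` when nothing fits). A rough invocation sketch under stated assumptions: the input shape (record => token array) is inferred from `create_grams`, the sample data is invented, and `GramCollection`, which this diff does not show, must be loadable:

    require 'categorize/models/bag_of_words'

    bag = Categorize::Models::BagOfWords.new
    buckets = bag.model('ruby gems', {
      'doc1' => %w[fast json parser gem],
      'doc2' => %w[json schema validator],
      'doc3' => %w[http client gem]
    })
    # buckets maps each winning gram (or nil) to the records it covers,
    # e.g. something like { 'json' => ['doc1', 'doc2'], nil => ['doc3'] }
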
data/lib/categorize/utils/grams.rb ADDED
@@ -0,0 +1,46 @@
+# encoding: utf-8
+
+module Categorize
+  module Utils
+    module Grams
+      def create_grams(query, records_to_words)
+        all_grams = []
+        @query = query
+        @query_terms = query.split.map(&:downcase).map(&:strip)
+        @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
+
+        invalid = Proc.new do |gram, *args|
+          # remove [[gram]] if == [[query]]
+          gram == @query || gram == @query_alt || @query_terms.include?(gram)
+        end
+
+        gram_collections = records_to_words.map do |record, words|
+          gram_collection = GramCollection.new(record, words, invalid)
+          all_grams += gram_collection.grams
+          gram_collection
+        end
+        return gram_collections, make_grams_unique(all_grams)
+      end
+
+      def check_plurals(frequent_grams)
+        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
+        frequent_grams_contents = frequent_grams.map(&:content)
+        frequent_grams.delete_if do |gram|
+          gram.content[-1] == 's' and
+            frequent_grams_contents.include?(gram.content[0...-1])
+        end
+      end
+
+      def make_grams_unique(grams)
+        grams.reduce({}) do |hash, gram|
+          if hash[gram.content]
+            hash[gram.content].frequency += gram.frequency
+          else
+            hash[gram.content] = gram
+          end
+          hash
+        end.values
+      end
+    end
+  end
+end
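
`make_grams_unique` collapses duplicate grams by folding each repeat's frequency into the first occurrence. A behavioral sketch using a stand-in `Struct` for the gem's `GramNode` class (only `#content` and `#frequency` are assumed here):

    Gram = Struct.new(:content, :frequency)
    grams = [Gram.new('ruby', 2), Gram.new('gem', 1), Gram.new('ruby', 3)]

    merged = grams.reduce({}) do |hash, gram|
      if hash[gram.content]
        hash[gram.content].frequency += gram.frequency  # fold duplicate in
      else
        hash[gram.content] = gram
      end
      hash
    end.values

    merged.map { |g| [g.content, g.frequency] } # => [["ruby", 5], ["gem", 1]]
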
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: categorize
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -18,10 +18,10 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/categorize.rb
-- lib/constants.rb
-- lib/models/bag_of_words.rb
-- lib/utils/grams.rb
+- lib/categorize/model.rb
+- lib/categorize/constants.rb
+- lib/categorize/models/bag_of_words.rb
+- lib/categorize/utils/grams.rb
 homepage: http://www.helioid.com/
 licenses: []
 post_install_message:
@@ -42,7 +42,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.25
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Text categorization library
data/lib/models/bag_of_words.rb DELETED
@@ -1,97 +0,0 @@
-require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
-
-class BagOfWords
-  include ::Utils::Grams
-
-  # DEBUG = false
-  # TODO: some gradient descent to choose this number
-  # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
-  MIN_SUPP_L = 0.07
-  MIN_SUPP_H = 0.1
-  NUM_TOP_GRAMS = 250
-  MAX_BUCKETS = 8
-
-  # function worst case
-  # O(2 x (#frequent_grams x #gram_collections) + #all_grams + MAX_BUCKETS x #gram_collections)
-  def model(query, records_to_tokens)
-    @gram_cover_cache = {}
-    @gram_collections, @all_grams = create_grams(query, records_to_tokens)
-
-    top_grams = determine_frequency_term_sets(@all_grams, query)
-    top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
-      top_grams[gram_c1] <=> top_grams[gram_c2]
-    end.first(MAX_BUCKETS)
-
-    # below block, worst case O(MAX_BUCKETS x #gram_collections)
-    @gram_collections.inject({}) do |buckets, gram_collection|
-      max_fitness = 0
-      max_fit = nil
-      top_grams.each do |top_gram|
-        # the >= removes the 'none' possibility
-        if gram_collection.fitness[top_gram] && gram_collection.fitness[top_gram] >= max_fitness
-          max_fitness = gram_collection.fitness[top_gram]
-          max_fit = top_gram
-        end
-      end
-      buckets[max_fit] ||= []
-      buckets[max_fit] << gram_collection.content
-      buckets
-    end
-  end
-
-  # ==== Return
-  # Hash - fitness => [gram_collection, ...]
-  # function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
-  def determine_frequency_term_sets(all_grams, query)
-    # only count a result if it has non-0 words length
-    effective_length = @gram_collections.reject do |result|
-      result.grams.nil? || result.grams.empty?
-    end.length
-
-    min_cover_l = MIN_SUPP_L * effective_length
-    # min_cover_h = MIN_SUPP_H * effective_length
-
-    # for speed only look at top N grams
-    # below block, worst case O(#all_grams)
-    frequent_grams = all_grams.sort do |gram1, gram2|
-      gram2.frequency <=> gram1.frequency
-    end.first(NUM_TOP_GRAMS)
-
-    # below block, worst case O(#frequent_grams x #gram_collections)
-    frequent_grams = frequent_grams.delete_if do |gram|
-      !cover(gram, min_cover_l)
-    end
-
-    # below block, worst case O(#frequent_grams x #gram_collections)
-    @gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
-      max_fitness = 0
-      max_fit = nil
-
-      frequent_grams.each do |gram|
-        fitness = gram_collection.fitness[gram.content] = (gram_collection.content_to_frequency[gram.content] || 0) / gram.frequency.to_f
-        if fitness > max_fitness
-          max_fitness = fitness
-          max_fit = gram.content
-        end
-      end
-
-      # puts "#{max_fit}: #{max_fitness}"# if DEBUG
-      top_grams[max_fit] += 1 if max_fit
-      top_grams
-    end
-  end
-
-  # function worstcase O(#gram_collections)
-  def cover(gram, min_length)
-    ((cached = @gram_cover_cache[gram]) != nil) and return cached
-    count = 0
-    @gram_collections.each do |gram_collection|
-      frequency = gram_collection.content_to_frequency[gram.content]
-      if !frequency.nil? && frequency > 0
-        count += 1
-        return @gram_cover_cache[gram] = true if count >= min_length
-      end
-    end
-    @gram_cover_cache[gram] = false
-  end
-end
data/lib/utils/grams.rb DELETED
@@ -1,45 +0,0 @@
-require File.join(File.dirname(__FILE__), 'gram_collection')
-require File.join(File.dirname(__FILE__), 'gram_node')
-
-module Utils
-  module Grams
-    def create_grams(query, records_to_words)
-      all_grams = []
-      @query = query
-      @query_terms = query.split.map(&:downcase).map(&:strip)
-      @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
-
-      invalid = Proc.new do |gram, *args|
-        # remove [[gram]] if == [[query]]
-        gram == @query || gram == @query_alt || @query_terms.include?(gram)
-      end
-
-      gram_collections = records_to_words.map do |record, words|
-        gram_collection = GramCollection.new(record, words, invalid)
-        all_grams += gram_collection.grams
-        gram_collection
-      end
-      return gram_collections, make_grams_unique(all_grams)
-    end
-
-    def check_plurals(frequent_grams)
-      # if exists [[gram]] and [[gram]]s then remove [[gram]]s
-      frequent_grams_contents = frequent_grams.map(&:content)
-      frequent_grams.delete_if do |gram|
-        gram.content[-1] == 's' and
-          frequent_grams_contents.include?(gram.content[0...-1])
-      end
-    end
-
-    def make_grams_unique(grams)
-      grams.inject({}) do |hash, gram|
-        if hash[gram.content]
-          hash[gram.content].frequency += gram.frequency
-        else
-          hash[gram.content] = gram
-        end
-        hash
-      end.values
-    end
-  end
-end