categorize 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
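For orientation, here is a minimal usage sketch of the class this release renames and namespaces (Categorize::Models::BagOfWords). It is illustrative only and not part of the published diff: the records and token lists are invented example data, and it assumes the gem's supporting classes (GramCollection, GramNode), which do not appear in this diff, are loaded.

    require 'categorize/models/bag_of_words'

    # `model` takes a query string and a hash mapping each record to its
    # tokens; per the diff below, it returns a hash whose keys are the
    # best-fitting grams and whose values are the records bucketed under them.
    bag = Categorize::Models::BagOfWords.new
    buckets = bag.model('ruby gems', {
      'doc1' => %w[ruby gems packaging library],
      'doc2' => %w[ruby text categorization library]
    })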
data/lib/categorize/constants.rb CHANGED
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 module Constants
   module Words
     # only include words > 2 chars
data/lib/categorize/model.rb CHANGED
@@ -1,9 +1,8 @@
-require File.join(File.dirname(__FILE__), 'models', 'bag_of_words')
-require File.join(File.dirname(__FILE__), 'constants')
+# encoding: utf-8

 module Categorize
   MIN_WORD_LENGTH = 3
-  @bag_of_words = BagOfWords.new
+  @bag_of_words = Models::BagOfWords.new

   class << self
     #include Bow
@@ -24,7 +23,7 @@ module Categorize
       strings.map { |s| preprocess(s) }
       #ret = model_bow(array_of_tokens);
       count = 0
-      ret.inject({}) do |hash, term|
+      ret.reduce({}) do |hash, term|
         hash[term] ||= []
         hash[term] << count += 1
         hash
@@ -39,8 +38,9 @@ module Categorize
     end

     def preprocess(string)
-      string.split(Constants::Words::SPLIT_REGEX).map(&:downcase).delete_if do
-        |word|
+      split_lower_strings = string.split(
+        Constants::Words::SPLIT_REGEX).map(&:downcase)
+      split_lower_strings.delete_if do |word|
         word.length < MIN_WORD_LENGTH ||
           Constants::Words::COMMON.include?(word)
       end
data/lib/categorize/models/bag_of_words.rb ADDED
@@ -0,0 +1,112 @@
+# encoding: utf-8
+
+require 'categorize/utils/grams'
+
+module Categorize
+  module Models
+    class BagOfWords
+      include Utils::Grams
+
+      # DEBUG = false
+      # TODO: some gradient descent to choose this number
+      # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
+      MIN_SUPP_L = 0.07
+      MIN_SUPP_H = 0.1
+      NUM_TOP_GRAMS = 250
+      MAX_BUCKETS = 8
+
+      # function worst case
+      # O(2 x (|frequent_grams| x |gram_collections|) +
+      #   |all_grams| + MAX_BUCKETS x |gram_collections|)
+      def model(query, records_to_tokens)
+        @gram_cover_cache = {}
+        @gram_collections, @all_grams = create_grams(query, records_to_tokens)
+
+        top_grams = determine_frequency_term_sets(@all_grams, query)
+        top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
+          top_grams[gram_c1] <=> top_grams[gram_c2]
+        end.first(MAX_BUCKETS)
+
+        # below block, worst case O(MAX_BUCKETS x |gram_collections|)
+        @gram_collections.reduce({}) do |buckets, gram_collection|
+          max_fitness = 0
+          max_fit = nil
+          top_grams.each do |top_gram|
+            # the >= removes the 'none' possibility
+            if gram_collection.fitness[top_gram] &&
+                gram_collection.fitness[top_gram] >= max_fitness
+              max_fitness = gram_collection.fitness[top_gram]
+              max_fit = top_gram
+            end
+          end
+          buckets[max_fit] ||= []
+          buckets[max_fit] << gram_collection.content
+          buckets
+        end
+      end
+
+      # ==== Return
+      # Hash - fitness => [gram_collection, ...]
+      # function worst case O(2 x (|frequent_grams| x |gram_collections|) +
+      #   |all_grams|)
+      def determine_frequency_term_sets(all_grams, query)
+        # only count a result if it has non-0 words length
+        effective_length = @gram_collections.reject do |result|
+          result.grams.nil? || result.grams.empty?
+        end.length
+
+        min_cover_l = MIN_SUPP_L * effective_length
+        # min_cover_h = MIN_SUPP_H * effective_length
+
+        # for speed only look at top N grams
+        # below block, worst case O(|all_grams|)
+        frequent_grams = all_grams.sort do |gram1, gram2|
+          gram2.frequency <=> gram1.frequency
+        end.first(NUM_TOP_GRAMS)
+
+        # below block, worst case O(|frequent_grams| x |gram_collections|)
+        frequent_grams = frequent_grams.delete_if do |gram|
+          !cover(gram, min_cover_l)
+        end
+
+        # below block, worst case O(|frequent_grams| x |gram_collections|)
+        @gram_collections.reduce(Hash.new(0)) do |top_grams, gram_collection|
+          max_fitness = 0
+          max_fit = nil
+
+          frequent_grams.each do |gram|
+            content_frequency = (
+              gram_collection.content_to_frequency[gram.content] || 0)
+            fitness = content_frequency / gram.frequency.to_f
+            gram_collection.fitness[gram.content] = fitness
+
+            if fitness > max_fitness
+              max_fitness = fitness
+              max_fit = gram.content
+            end
+          end
+
+          # puts "#{max_fit}: #{max_fitness}"# if DEBUG
+          top_grams[max_fit] += 1 if max_fit
+          top_grams
+        end
+      end
+
+      # function worst case O(|gram_collections|)
+      def cover(gram, min_length)
+        ((cached = @gram_cover_cache[gram]) != nil) and return cached
+        count = 0
+
+        @gram_collections.each do |gram_collection|
+          frequency = gram_collection.content_to_frequency[gram.content]
+          if !frequency.nil? && frequency > 0
+            count += 1
+            return @gram_cover_cache[gram] = true if count >= min_length
+          end
+        end
+
+        @gram_cover_cache[gram] = false
+      end
+    end
+  end
+end
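For intuition about the constants above (illustrative arithmetic, not part of the package): MIN_SUPP_L is a minimum-support threshold, so a gram survives the cover filter only if enough non-empty gram collections contain it, and a surviving gram's fitness for a collection is that collection's share of the gram's corpus-wide frequency.

    # Hypothetical numbers, for illustration only.
    effective_length = 50                  # gram collections with at least one gram
    min_cover_l = 0.07 * effective_length  # => 3.5, so cover needs count >= 3.5 (4 collections)

    gram_frequency    = 10.0  # corpus-wide frequency of one gram
    content_frequency = 4     # its frequency within one collection
    fitness = content_frequency / gram_frequency  # => 0.4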
data/lib/categorize/utils/grams.rb ADDED
@@ -0,0 +1,46 @@
+# encoding: utf-8
+
+module Categorize
+  module Utils
+    module Grams
+      def create_grams(query, records_to_words)
+        all_grams = []
+        @query = query
+        @query_terms = query.split.map(&:downcase).map(&:strip)
+        @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
+
+        invalid = Proc.new do |gram, *args|
+          # remove [[gram]] if == [[query]]
+          gram == @query || gram == @query_alt || @query_terms.include?(gram)
+        end
+
+        gram_collections = records_to_words.map do |record, words|
+          gram_collection = GramCollection.new(record, words, invalid)
+          all_grams += gram_collection.grams
+          gram_collection
+        end
+        return gram_collections, make_grams_unique(all_grams)
+      end
+
+      def check_plurals(frequent_grams)
+        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
+        frequent_grams_contents = frequent_grams.map(&:content)
+        frequent_grams.delete_if do |gram|
+          gram.content[-1] == 's' and
+            frequent_grams_contents.include?(gram.content[0...-1])
+        end
+      end
+
+      def make_grams_unique(grams)
+        grams.reduce({}) do |hash, gram|
+          if hash[gram.content]
+            hash[gram.content].frequency += gram.frequency
+          else
+            hash[gram.content] = gram
+          end
+          hash
+        end.values
+      end
+    end
+  end
+end
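A small sketch of what make_grams_unique does: grams with the same content are merged by summing their frequencies, keeping one object per distinct content. GramNode and GramCollection are not shown in this diff, so a plain Struct stands in for anything exposing #content and #frequency.

    require 'categorize/utils/grams'

    Gram = Struct.new(:content, :frequency)  # stand-in for the gem's gram objects

    include Categorize::Utils::Grams

    grams = [Gram.new('ruby', 2), Gram.new('gems', 1), Gram.new('ruby', 3)]
    make_grams_unique(grams).map { |g| [g.content, g.frequency] }
    # => [["ruby", 5], ["gems", 1]]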
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: categorize
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 prerelease:
 platform: ruby
 authors:
@@ -18,10 +18,10 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/categorize.rb
-- lib/constants.rb
-- lib/models/bag_of_words.rb
-- lib/utils/grams.rb
+- lib/categorize/model.rb
+- lib/categorize/constants.rb
+- lib/categorize/models/bag_of_words.rb
+- lib/categorize/utils/grams.rb
 homepage: http://www.helioid.com/
 licenses: []
 post_install_message:
@@ -42,7 +42,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.25
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Text categorization library
data/lib/models/bag_of_words.rb DELETED
@@ -1,97 +0,0 @@
-require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
-
-class BagOfWords
-  include ::Utils::Grams
-
-  # DEBUG = false
-  # TODO: some gradient descent to choose this number
-  # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
-  MIN_SUPP_L = 0.07
-  MIN_SUPP_H = 0.1
-  NUM_TOP_GRAMS = 250
-  MAX_BUCKETS = 8
-
-  # function worst case
-  # O(2 x (#frequent_grams x #gram_collections) + #all_grams + MAX_BUCKETS x #gram_collections)
-  def model(query, records_to_tokens)
-    @gram_cover_cache = {}
-    @gram_collections, @all_grams = create_grams(query, records_to_tokens)
-
-    top_grams = determine_frequency_term_sets(@all_grams, query)
-    top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
-      top_grams[gram_c1] <=> top_grams[gram_c2]
-    end.first(MAX_BUCKETS)
-
-    # below block, worst case O(MAX_BUCKETS x #gram_collections)
-    @gram_collections.inject({}) do |buckets, gram_collection|
-      max_fitness = 0
-      max_fit = nil
-      top_grams.each do |top_gram|
-        # the >= removes the 'none' possibility
-        if gram_collection.fitness[top_gram] && gram_collection.fitness[top_gram] >= max_fitness
-          max_fitness = gram_collection.fitness[top_gram]
-          max_fit = top_gram
-        end
-      end
-      buckets[max_fit] ||= []
-      buckets[max_fit] << gram_collection.content
-      buckets
-    end
-  end
-
-  # ==== Return
-  # Hash - fitness => [gram_collection, ...]
-  # function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
-  def determine_frequency_term_sets(all_grams, query)
-    # only count a result if it has non-0 words length
-    effective_length = @gram_collections.reject do |result|
-      result.grams.nil? || result.grams.empty?
-    end.length
-
-    min_cover_l = MIN_SUPP_L * effective_length
-    # min_cover_h = MIN_SUPP_H * effective_length
-
-    # for speed only look at top N grams
-    # below block, worst case O(#all_grams)
-    frequent_grams = all_grams.sort do |gram1, gram2|
-      gram2.frequency <=> gram1.frequency
-    end.first(NUM_TOP_GRAMS)
-
-    # below block, worst case O(#frequent_grams x #gram_collections)
-    frequent_grams = frequent_grams.delete_if do |gram|
-      !cover(gram, min_cover_l)
-    end
-
-    # below block, worst case O(#frequent_grams x #gram_collections)
-    @gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
-      max_fitness = 0
-      max_fit = nil
-
-      frequent_grams.each do |gram|
-        fitness = gram_collection.fitness[gram.content] = (gram_collection.content_to_frequency[gram.content] || 0) / gram.frequency.to_f
-        if fitness > max_fitness
-          max_fitness = fitness
-          max_fit = gram.content
-        end
-      end
-
-      # puts "#{max_fit}: #{max_fitness}"# if DEBUG
-      top_grams[max_fit] += 1 if max_fit
-      top_grams
-    end
-  end
-
-  # function worstcase O(#gram_collections)
-  def cover(gram, min_length)
-    ((cached = @gram_cover_cache[gram]) != nil) and return cached
-    count = 0
-    @gram_collections.each do |gram_collection|
-      frequency = gram_collection.content_to_frequency[gram.content]
-      if !frequency.nil? && frequency > 0
-        count += 1
-        return @gram_cover_cache[gram] = true if count >= min_length
-      end
-    end
-    @gram_cover_cache[gram] = false
-  end
-end
data/lib/utils/grams.rb DELETED
@@ -1,45 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'gram_collection')
2
- require File.join(File.dirname(__FILE__), 'gram_node')
3
-
4
- module Utils
5
- module Grams
6
- def create_grams(query, records_to_words)
7
- all_grams = []
8
- @query = query
9
- @query_terms = query.split.map(&:downcase).map(&:strip)
10
- @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
11
-
12
- invalid = Proc.new do |gram, *args|
13
- # remove [[gram]] if == [[query]]
14
- gram == @query || gram == @query_alt || @query_terms.include?(gram)
15
- end
16
-
17
- gram_collections = records_to_words.map do |record, words|
18
- gram_collection = GramCollection.new(record, words, invalid)
19
- all_grams += gram_collection.grams
20
- gram_collection
21
- end
22
- return gram_collections, make_grams_unique(all_grams)
23
- end
24
-
25
- def check_plurals(frequent_grams)
26
- # if exists [[gram]] and [[gram]]s then remove [[gram]]s
27
- frequent_grams_contents = frequent_grams.map(&:content)
28
- frequent_grams.delete_if do |gram|
29
- gram.content[-1] == 's' and
30
- frequent_grams_contents.include?(gram.content[0...-1])
31
- end
32
- end
33
-
34
- def make_grams_unique(grams)
35
- grams.inject({}) do |hash, gram|
36
- if hash[gram.content]
37
- hash[gram.content].frequency += gram.frequency
38
- else
39
- hash[gram.content] = gram
40
- end
41
- hash
42
- end.values
43
- end
44
- end
45
- end