categorize 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,8 @@
|
|
1
|
-
|
2
|
-
require File.join(File.dirname(__FILE__), 'constants')
|
1
|
+
# encoding: utf-8
|
3
2
|
|
4
3
|
module Categorize
|
5
4
|
MIN_WORD_LENGTH = 3
|
6
|
-
@bag_of_words = BagOfWords.new
|
5
|
+
@bag_of_words = Models::BagOfWords.new
|
7
6
|
|
8
7
|
class << self
|
9
8
|
#include Bow
|
@@ -24,7 +23,7 @@ module Categorize
|
|
24
23
|
strings.map { |s| preprocess(s) }
|
25
24
|
#ret = model_bow(array_of_tokens);
|
26
25
|
count = 0
|
27
|
-
ret.
|
26
|
+
ret.reduce({}) do |hash, term|
|
28
27
|
hash[term] ||= []
|
29
28
|
hash[term] << count += 1
|
30
29
|
hash
|
@@ -39,8 +38,9 @@ module Categorize
|
|
39
38
|
end
|
40
39
|
|
41
40
|
def preprocess(string)
|
42
|
-
string.split(
|
43
|
-
|
41
|
+
split_lower_strings = string.split(
|
42
|
+
Constants::Words::SPLIT_REGEX).map(&:downcase)
|
43
|
+
split_lower_strings.delete_if do |word|
|
44
44
|
word.length < MIN_WORD_LENGTH ||
|
45
45
|
Constants::Words::COMMON.include?(word)
|
46
46
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'categorize/utils/grams'
|
4
|
+
|
5
|
+
module Categorize
|
6
|
+
module Models
|
7
|
+
class BagOfWords
|
8
|
+
include Utils::Grams
|
9
|
+
|
10
|
+
# DEBUG = false
|
11
|
+
# TODO: some gradient descent to choose this number
|
12
|
+
# 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
|
13
|
+
MIN_SUPP_L = 0.07
|
14
|
+
MIN_SUPP_H = 0.1
|
15
|
+
NUM_TOP_GRAMS = 250
|
16
|
+
MAX_BUCKETS = 8
|
17
|
+
|
18
|
+
# Assigns every gram collection to the bucket of its best-fitting top gram.
#
# ==== Parameters
# query:: the search query; passed through to create_grams and
#   determine_frequency_term_sets
# records_to_tokens:: record/token pairs; passed through to create_grams
#
# ==== Return
# Hash - top gram content => [gram_collection.content, ...]
#   (the key is nil for collections that have no fitness entry for any
#   top gram, e.g. when there are no top grams at all)
#
# ==== Side effects
# Resets @gram_cover_cache and overwrites @gram_collections and @all_grams.
#
# The bucketing pass performs at most MAX_BUCKETS fitness lookups per gram
# collection; the rest of the cost comes from create_grams and
# determine_frequency_term_sets.
def model(query, records_to_tokens)
  @gram_cover_cache = {}
  @gram_collections, @all_grams = create_grams(query, records_to_tokens)

  gram_counts = determine_frequency_term_sets(@all_grams, query)
  # NOTE(review): the sort is ascending, so first(MAX_BUCKETS) keeps the
  # grams with the *smallest* counts - confirm this is intended.
  top_grams = gram_counts.keys.sort do |gram_c1, gram_c2|
    gram_counts[gram_c1] <=> gram_counts[gram_c2]
  end.first(MAX_BUCKETS)

  @gram_collections.each_with_object({}) do |gram_collection, buckets|
    max_fitness = 0
    max_fit = nil

    top_grams.each do |top_gram|
      fitness = gram_collection.fitness[top_gram]
      # >= (rather than >) lets a fitness of 0 still select a gram, so
      # max_fit only stays nil when no top gram has a fitness entry; on a
      # tie the later top gram wins.
      next unless fitness && fitness >= max_fitness

      max_fitness = fitness
      max_fit = top_gram
    end

    (buckets[max_fit] ||= []) << gram_collection.content
  end
end
|
47
|
+
|
48
|
+
# Counts, for each frequent gram, how many gram collections it fits best.
#
# ==== Parameters
# all_grams:: grams responding to +frequency+ and +content+
# query:: not used by this method; kept so existing callers keep working
#
# ==== Return
# Hash - gram content => number of gram collections whose highest-fitness
#   frequent gram has that content (missing keys read as 0)
#
# ==== Side effects
# Stores gram_collection.fitness[gram.content] for every pair of retained
# frequent gram and gram collection; calls cover, which fills
# @gram_cover_cache.
#
# Work done: one sort of all_grams, one cover call per gram among the
# NUM_TOP_GRAMS most frequent, then one fitness computation per
# (retained gram, gram collection) pair.
def determine_frequency_term_sets(all_grams, query)
  # only count a result if it has non-0 words length
  effective_length = @gram_collections.count do |result|
    !(result.grams.nil? || result.grams.empty?)
  end

  # Minimum number of collections a gram must appear in to be kept.
  # MIN_SUPP_H (an upper bound) is defined but not applied here.
  min_cover_l = MIN_SUPP_L * effective_length

  # for speed only look at top N grams
  most_frequent = all_grams.sort do |gram1, gram2|
    gram2.frequency <=> gram1.frequency
  end.first(NUM_TOP_GRAMS)

  frequent_grams = most_frequent.select { |gram| cover(gram, min_cover_l) }

  @gram_collections.each_with_object(Hash.new(0)) do |gram_collection, top_grams|
    max_fitness = 0
    max_fit = nil

    frequent_grams.each do |gram|
      content_frequency =
        gram_collection.content_to_frequency[gram.content] || 0
      # Share of the gram's overall frequency found in this collection.
      fitness = content_frequency / gram.frequency.to_f
      gram_collection.fitness[gram.content] = fitness

      # Strict > : a gram with fitness 0 never becomes the best fit, and
      # on a tie the earlier (more frequent) gram is kept.
      if fitness > max_fitness
        max_fitness = fitness
        max_fit = gram.content
      end
    end

    top_grams[max_fit] += 1 if max_fit
  end
end
|
94
|
+
|
95
|
+
# Tells whether +gram+ occurs (frequency > 0) in at least +min_length+ of
# the gram collections in @gram_collections.
#
# ==== Parameters
# gram:: object responding to +content+; also used as the cache key
# min_length:: minimum number of collections that must contain the gram
#
# ==== Return
# true or false
#
# The answer is memoized in @gram_cover_cache per gram. The cache key does
# not include min_length, so a cached answer is reused even if a different
# threshold is passed later. Scans each gram collection at most once and
# stops as soon as the threshold is reached.
def cover(gram, min_length)
  cached = @gram_cover_cache[gram]
  # A cached false is a valid answer, so only nil means "not computed yet".
  return cached unless cached.nil?

  count = 0
  @gram_collections.each do |gram_collection|
    frequency = gram_collection.content_to_frequency[gram.content]
    next unless frequency && frequency > 0

    count += 1
    return @gram_cover_cache[gram] = true if count >= min_length
  end

  @gram_cover_cache[gram] = false
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
|
4
|
+
module Utils
|
5
|
+
module Grams
|
6
|
+
# Builds one GramCollection per record plus the merged list of all grams.
#
# ==== Parameters
# query:: the search query (must respond to +split+)
# records_to_words:: enumerable yielding (record, words) pairs
#
# ==== Return
# Array - [gram_collections, unique_grams], where unique_grams is the
#   result of make_grams_unique over every collection's grams
#
# ==== Side effects
# Sets @query, @query_terms (downcased query words) and @query_alt (the
# query words rotated left by one, e.g. "a b c" -> "b c a").
def create_grams(query, records_to_words)
  all_grams = []
  @query = query
  @query_terms = query.split.map { |term| term.downcase.strip }
  # Join the rotated terms explicitly: interpolating the Array slice
  # directly would embed its inspect form (brackets and quotes), which
  # can never equal a gram.
  @query_alt = @query_terms.rotate.join(' ')

  # Handed to each GramCollection; extra arguments are accepted and ignored.
  invalid = Proc.new do |gram, *_args|
    # remove [[gram]] if == [[query]]
    gram == @query || gram == @query_alt || @query_terms.include?(gram)
  end

  gram_collections = records_to_words.map do |record, words|
    gram_collection = GramCollection.new(record, words, invalid)
    grams = gram_collection.grams
    # grams may be nil (determine_frequency_term_sets guards for that);
    # concat appends in place instead of rebuilding the array each time.
    all_grams.concat(grams) if grams
    gram_collection
  end

  [gram_collections, make_grams_unique(all_grams)]
end
|
24
|
+
|
25
|
+
# Drops simple plurals: if both [[gram]] and [[gram]]s are present, the
# [[gram]]s entry is removed.
#
# ==== Parameters
# frequent_grams:: Array of grams responding to +content+
#
# ==== Return
# Array - the same +frequent_grams+ object, modified in place
def check_plurals(frequent_grams)
  # Snapshot taken before deleting, so the check does not depend on the
  # order in which entries are removed.
  frequent_grams_contents = frequent_grams.map(&:content)

  frequent_grams.delete_if do |gram|
    gram.content[-1] == 's' &&
      frequent_grams_contents.include?(gram.content[0...-1])
  end
end
|
33
|
+
|
34
|
+
# Merges grams that share the same content.
#
# ==== Parameters
# grams:: enumerable of grams responding to +content+, +frequency+ and
#   +frequency=+
#
# ==== Return
# Array - one gram per distinct content: the first gram seen with that
#   content, whose frequency has been increased in place by the frequency
#   of every later duplicate
def make_grams_unique(grams)
  merged = grams.each_with_object({}) do |gram, by_content|
    existing = by_content[gram.content]
    if existing
      existing.frequency += gram.frequency
    else
      by_content[gram.content] = gram
    end
  end

  merged.values
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -18,10 +18,10 @@ executables: []
|
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
|
-
- lib/categorize.rb
|
22
|
-
- lib/constants.rb
|
23
|
-
- lib/models/bag_of_words.rb
|
24
|
-
- lib/utils/grams.rb
|
21
|
+
- lib/categorize/model.rb
|
22
|
+
- lib/categorize/constants.rb
|
23
|
+
- lib/categorize/models/bag_of_words.rb
|
24
|
+
- lib/categorize/utils/grams.rb
|
25
25
|
homepage: http://www.helioid.com/
|
26
26
|
licenses: []
|
27
27
|
post_install_message:
|
@@ -42,7 +42,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
42
42
|
version: '0'
|
43
43
|
requirements: []
|
44
44
|
rubyforge_project:
|
45
|
-
rubygems_version: 1.8.
|
45
|
+
rubygems_version: 1.8.24
|
46
46
|
signing_key:
|
47
47
|
specification_version: 3
|
48
48
|
summary: Text categorization library
|
data/lib/models/bag_of_words.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
|
2
|
-
|
3
|
-
class BagOfWords
|
4
|
-
include ::Utils::Grams
|
5
|
-
|
6
|
-
# DEBUG = false
|
7
|
-
# TODO: some gradient descent to choose this number
|
8
|
-
# 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
|
9
|
-
MIN_SUPP_L = 0.07
|
10
|
-
MIN_SUPP_H = 0.1
|
11
|
-
NUM_TOP_GRAMS = 250
|
12
|
-
MAX_BUCKETS = 8
|
13
|
-
|
14
|
-
# function worst case
|
15
|
-
# O(2 x (#frequent_grams x #gram_collections) + #all_grams + MAX_BUCKETS x #gram_collections)
|
16
|
-
def model(query, records_to_tokens)
|
17
|
-
@gram_cover_cache = {}
|
18
|
-
@gram_collections, @all_grams = create_grams(query, records_to_tokens)
|
19
|
-
|
20
|
-
top_grams = determine_frequency_term_sets(@all_grams, query)
|
21
|
-
top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
|
22
|
-
top_grams[gram_c1] <=> top_grams[gram_c2]
|
23
|
-
end.first(MAX_BUCKETS)
|
24
|
-
|
25
|
-
# below block, worst case O(MAX_BUCKETS x #gram_collections)
|
26
|
-
@gram_collections.inject({}) do |buckets, gram_collection|
|
27
|
-
max_fitness = 0
|
28
|
-
max_fit = nil
|
29
|
-
top_grams.each do |top_gram|
|
30
|
-
# the >= removes the 'none' possibility
|
31
|
-
if gram_collection.fitness[top_gram] && gram_collection.fitness[top_gram] >= max_fitness
|
32
|
-
max_fitness = gram_collection.fitness[top_gram]
|
33
|
-
max_fit = top_gram
|
34
|
-
end
|
35
|
-
end
|
36
|
-
buckets[max_fit] ||= []
|
37
|
-
buckets[max_fit] << gram_collection.content
|
38
|
-
buckets
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
# ==== Return
|
43
|
-
# Hash - fitness => [gram_collection, ...]
|
44
|
-
# function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
|
45
|
-
def determine_frequency_term_sets(all_grams, query)
|
46
|
-
# only count a result if it has non-0 words length
|
47
|
-
effective_length = @gram_collections.reject do |result|
|
48
|
-
result.grams.nil? || result.grams.empty?
|
49
|
-
end.length
|
50
|
-
|
51
|
-
min_cover_l = MIN_SUPP_L * effective_length
|
52
|
-
# min_cover_h = MIN_SUPP_H * effective_length
|
53
|
-
|
54
|
-
# for speed only look at top N grams
|
55
|
-
# below block, worst case O(#all_grams)
|
56
|
-
frequent_grams = all_grams.sort do |gram1, gram2|
|
57
|
-
gram2.frequency <=> gram1.frequency
|
58
|
-
end.first(NUM_TOP_GRAMS)
|
59
|
-
|
60
|
-
# below block, worst case O(#frequent_grams x #gram_collections)
|
61
|
-
frequent_grams = frequent_grams.delete_if do |gram|
|
62
|
-
!cover(gram, min_cover_l)
|
63
|
-
end
|
64
|
-
|
65
|
-
# below block, worst case O(#frequent_grams x #gram_collections)
|
66
|
-
@gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
|
67
|
-
max_fitness = 0
|
68
|
-
max_fit = nil
|
69
|
-
|
70
|
-
frequent_grams.each do |gram|
|
71
|
-
fitness = gram_collection.fitness[gram.content] = (gram_collection.content_to_frequency[gram.content] || 0) / gram.frequency.to_f
|
72
|
-
if fitness > max_fitness
|
73
|
-
max_fitness = fitness
|
74
|
-
max_fit = gram.content
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# puts "#{max_fit}: #{max_fitness}"# if DEBUG
|
79
|
-
top_grams[max_fit] += 1 if max_fit
|
80
|
-
top_grams
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
# function worstcase O(#gram_collections)
|
85
|
-
def cover(gram, min_length)
|
86
|
-
((cached = @gram_cover_cache[gram]) != nil) and return cached
|
87
|
-
count = 0
|
88
|
-
@gram_collections.each do |gram_collection|
|
89
|
-
frequency = gram_collection.content_to_frequency[gram.content]
|
90
|
-
if !frequency.nil? && frequency > 0
|
91
|
-
count += 1
|
92
|
-
return @gram_cover_cache[gram] = true if count >= min_length
|
93
|
-
end
|
94
|
-
end
|
95
|
-
@gram_cover_cache[gram] = false
|
96
|
-
end
|
97
|
-
end
|
data/lib/utils/grams.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'gram_collection')
|
2
|
-
require File.join(File.dirname(__FILE__), 'gram_node')
|
3
|
-
|
4
|
-
module Utils
|
5
|
-
module Grams
|
6
|
-
def create_grams(query, records_to_words)
|
7
|
-
all_grams = []
|
8
|
-
@query = query
|
9
|
-
@query_terms = query.split.map(&:downcase).map(&:strip)
|
10
|
-
@query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
|
11
|
-
|
12
|
-
invalid = Proc.new do |gram, *args|
|
13
|
-
# remove [[gram]] if == [[query]]
|
14
|
-
gram == @query || gram == @query_alt || @query_terms.include?(gram)
|
15
|
-
end
|
16
|
-
|
17
|
-
gram_collections = records_to_words.map do |record, words|
|
18
|
-
gram_collection = GramCollection.new(record, words, invalid)
|
19
|
-
all_grams += gram_collection.grams
|
20
|
-
gram_collection
|
21
|
-
end
|
22
|
-
return gram_collections, make_grams_unique(all_grams)
|
23
|
-
end
|
24
|
-
|
25
|
-
def check_plurals(frequent_grams)
|
26
|
-
# if exists [[gram]] and [[gram]]s then remove [[gram]]s
|
27
|
-
frequent_grams_contents = frequent_grams.map(&:content)
|
28
|
-
frequent_grams.delete_if do |gram|
|
29
|
-
gram.content[-1] == 's' and
|
30
|
-
frequent_grams_contents.include?(gram.content[0...-1])
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def make_grams_unique(grams)
|
35
|
-
grams.inject({}) do |hash, gram|
|
36
|
-
if hash[gram.content]
|
37
|
-
hash[gram.content].frequency += gram.frequency
|
38
|
-
else
|
39
|
-
hash[gram.content] = gram
|
40
|
-
end
|
41
|
-
hash
|
42
|
-
end.values
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|