categorize 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
@@ -1,9 +1,8 @@
|
|
1
|
-
|
2
|
-
require File.join(File.dirname(__FILE__), 'constants')
|
1
|
+
# encoding: utf-8
|
3
2
|
|
4
3
|
module Categorize
|
5
4
|
MIN_WORD_LENGTH = 3
|
6
|
-
@bag_of_words = BagOfWords.new
|
5
|
+
@bag_of_words = Models::BagOfWords.new
|
7
6
|
|
8
7
|
class << self
|
9
8
|
#include Bow
|
@@ -24,7 +23,7 @@ module Categorize
|
|
24
23
|
strings.map { |s| preprocess(s) }
|
25
24
|
#ret = model_bow(array_of_tokens);
|
26
25
|
count = 0
|
27
|
-
ret.inject({}) do |hash, term|
|
26
|
+
ret.reduce({}) do |hash, term|
|
28
27
|
hash[term] ||= []
|
29
28
|
hash[term] << count += 1
|
30
29
|
hash
|
@@ -39,8 +38,9 @@ module Categorize
|
|
39
38
|
end
|
40
39
|
|
41
40
|
def preprocess(string)
|
42
|
-
string.split(
|
43
|
-
|
41
|
+
split_lower_strings = string.split(
|
42
|
+
Constants::Words::SPLIT_REGEX).map(&:downcase)
|
43
|
+
split_lower_strings.delete_if do |word|
|
44
44
|
word.length < MIN_WORD_LENGTH ||
|
45
45
|
Constants::Words::COMMON.include?(word)
|
46
46
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'categorize/utils/grams'
|
4
|
+
|
5
|
+
module Categorize
|
6
|
+
module Models
|
7
|
+
class BagOfWords
|
8
|
+
include Utils::Grams
|
9
|
+
|
10
|
+
# DEBUG = false
|
11
|
+
# TODO: some gradient descent to choose this number
|
12
|
+
# 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
|
13
|
+
MIN_SUPP_L = 0.07
|
14
|
+
MIN_SUPP_H = 0.1
|
15
|
+
NUM_TOP_GRAMS = 250
|
16
|
+
MAX_BUCKETS = 8
|
17
|
+
|
18
|
+
# function worst case
|
19
|
+
# O(2 x (|frequent_grams| x |gram_collections|) +
|
20
|
+
# |all_grams| + MAX_BUCKETS x |gram_collections|)
|
21
|
+
def model(query, records_to_tokens)
|
22
|
+
@gram_cover_cache = {}
|
23
|
+
@gram_collections, @all_grams = create_grams(query, records_to_tokens)
|
24
|
+
|
25
|
+
top_grams = determine_frequency_term_sets(@all_grams, query)
|
26
|
+
top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
|
27
|
+
top_grams[gram_c1] <=> top_grams[gram_c2]
|
28
|
+
end.first(MAX_BUCKETS)
|
29
|
+
|
30
|
+
# below block, worst case O(MAX_BUCKETS x |gram_collections|)
|
31
|
+
@gram_collections.reduce({}) do |buckets, gram_collection|
|
32
|
+
max_fitness = 0
|
33
|
+
max_fit = nil
|
34
|
+
top_grams.each do |top_gram|
|
35
|
+
# the >= removes the 'none' possibility
|
36
|
+
if gram_collection.fitness[top_gram] &&
|
37
|
+
gram_collection.fitness[top_gram] >= max_fitness
|
38
|
+
max_fitness = gram_collection.fitness[top_gram]
|
39
|
+
max_fit = top_gram
|
40
|
+
end
|
41
|
+
end
|
42
|
+
buckets[max_fit] ||= []
|
43
|
+
buckets[max_fit] << gram_collection.content
|
44
|
+
buckets
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# ==== Return
|
49
|
+
# Hash - fitness => [gram_collection, ...]
|
50
|
+
# function worst case O(2 x (|frequent_grams| x |gram_collections|) +
|
51
|
+
# |all_grams|)
|
52
|
+
def determine_frequency_term_sets(all_grams, query)
|
53
|
+
# only count a result if it has non-0 words length
|
54
|
+
effective_length = @gram_collections.reject do |result|
|
55
|
+
result.grams.nil? || result.grams.empty?
|
56
|
+
end.length
|
57
|
+
|
58
|
+
min_cover_l = MIN_SUPP_L * effective_length
|
59
|
+
# min_cover_h = MIN_SUPP_H * effective_length
|
60
|
+
|
61
|
+
# for speed only look at top N grams
|
62
|
+
# below block, worst case O(|all_grams|)
|
63
|
+
frequent_grams = all_grams.sort do |gram1, gram2|
|
64
|
+
gram2.frequency <=> gram1.frequency
|
65
|
+
end.first(NUM_TOP_GRAMS)
|
66
|
+
|
67
|
+
# below block, worst case O(|frequent_grams| x |gram_collections|)
|
68
|
+
frequent_grams = frequent_grams.delete_if do |gram|
|
69
|
+
!cover(gram, min_cover_l)
|
70
|
+
end
|
71
|
+
|
72
|
+
# below block, worst case O(|frequent_grams| x |gram_collections|)
|
73
|
+
@gram_collections.reduce(Hash.new(0)) do |top_grams, gram_collection|
|
74
|
+
max_fitness = 0
|
75
|
+
max_fit = nil
|
76
|
+
|
77
|
+
frequent_grams.each do |gram|
|
78
|
+
content_frequency = (
|
79
|
+
gram_collection.content_to_frequency[gram.content] || 0)
|
80
|
+
fitness = content_frequency / gram.frequency.to_f
|
81
|
+
gram_collection.fitness[gram.content] = fitness
|
82
|
+
|
83
|
+
if fitness > max_fitness
|
84
|
+
max_fitness = fitness
|
85
|
+
max_fit = gram.content
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# puts "#{max_fit}: #{max_fitness}"# if DEBUG
|
90
|
+
top_grams[max_fit] += 1 if max_fit
|
91
|
+
top_grams
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# function worstcase O(#gram_collections)
|
96
|
+
def cover(gram, min_length)
|
97
|
+
((cached = @gram_cover_cache[gram]) != nil) and return cached
|
98
|
+
count = 0
|
99
|
+
|
100
|
+
@gram_collections.each do |gram_collection|
|
101
|
+
frequency = gram_collection.content_to_frequency[gram.content]
|
102
|
+
if !frequency.nil? && frequency > 0
|
103
|
+
count += 1
|
104
|
+
return @gram_cover_cache[gram] = true if count >= min_length
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
@gram_cover_cache[gram] = false
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
|
4
|
+
module Utils
|
5
|
+
module Grams
|
6
|
+
def create_grams(query, records_to_words)
|
7
|
+
all_grams = []
|
8
|
+
@query = query
|
9
|
+
@query_terms = query.split.map(&:downcase).map(&:strip)
|
10
|
+
@query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
|
11
|
+
|
12
|
+
invalid = Proc.new do |gram, *args|
|
13
|
+
# remove [[gram]] if == [[query]]
|
14
|
+
gram == @query || gram == @query_alt || @query_terms.include?(gram)
|
15
|
+
end
|
16
|
+
|
17
|
+
gram_collections = records_to_words.map do |record, words|
|
18
|
+
gram_collection = GramCollection.new(record, words, invalid)
|
19
|
+
all_grams += gram_collection.grams
|
20
|
+
gram_collection
|
21
|
+
end
|
22
|
+
return gram_collections, make_grams_unique(all_grams)
|
23
|
+
end
|
24
|
+
|
25
|
+
def check_plurals(frequent_grams)
|
26
|
+
# if exists [[gram]] and [[gram]]s then remove [[gram]]s
|
27
|
+
frequent_grams_contents = frequent_grams.map(&:content)
|
28
|
+
frequent_grams.delete_if do |gram|
|
29
|
+
gram.content[-1] == 's' and
|
30
|
+
frequent_grams_contents.include?(gram.content[0...-1])
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def make_grams_unique(grams)
|
35
|
+
grams.reduce({}) do |hash, gram|
|
36
|
+
if hash[gram.content]
|
37
|
+
hash[gram.content].frequency += gram.frequency
|
38
|
+
else
|
39
|
+
hash[gram.content] = gram
|
40
|
+
end
|
41
|
+
hash
|
42
|
+
end.values
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -18,10 +18,10 @@ executables: []
|
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
|
-
- lib/categorize.rb
|
22
|
-
- lib/constants.rb
|
23
|
-
- lib/models/bag_of_words.rb
|
24
|
-
- lib/utils/grams.rb
|
21
|
+
- lib/categorize/model.rb
|
22
|
+
- lib/categorize/constants.rb
|
23
|
+
- lib/categorize/models/bag_of_words.rb
|
24
|
+
- lib/categorize/utils/grams.rb
|
25
25
|
homepage: http://www.helioid.com/
|
26
26
|
licenses: []
|
27
27
|
post_install_message:
|
@@ -42,7 +42,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
42
42
|
version: '0'
|
43
43
|
requirements: []
|
44
44
|
rubyforge_project:
|
45
|
-
rubygems_version: 1.8.
|
45
|
+
rubygems_version: 1.8.24
|
46
46
|
signing_key:
|
47
47
|
specification_version: 3
|
48
48
|
summary: Text categorization library
|
data/lib/models/bag_of_words.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'utils', 'grams')
|
2
|
-
|
3
|
-
class BagOfWords
|
4
|
-
include ::Utils::Grams
|
5
|
-
|
6
|
-
# DEBUG = false
|
7
|
-
# TODO: some gradient descent to choose this number
|
8
|
-
# 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
|
9
|
-
MIN_SUPP_L = 0.07
|
10
|
-
MIN_SUPP_H = 0.1
|
11
|
-
NUM_TOP_GRAMS = 250
|
12
|
-
MAX_BUCKETS = 8
|
13
|
-
|
14
|
-
# function worst case
|
15
|
-
# O(2 x (#frequent_grams x #gram_collections) + #all_grams + MAX_BUCKETS x #gram_collections)
|
16
|
-
def model(query, records_to_tokens)
|
17
|
-
@gram_cover_cache = {}
|
18
|
-
@gram_collections, @all_grams = create_grams(query, records_to_tokens)
|
19
|
-
|
20
|
-
top_grams = determine_frequency_term_sets(@all_grams, query)
|
21
|
-
top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
|
22
|
-
top_grams[gram_c1] <=> top_grams[gram_c2]
|
23
|
-
end.first(MAX_BUCKETS)
|
24
|
-
|
25
|
-
# below block, worst case O(MAX_BUCKETS x #gram_collections)
|
26
|
-
@gram_collections.inject({}) do |buckets, gram_collection|
|
27
|
-
max_fitness = 0
|
28
|
-
max_fit = nil
|
29
|
-
top_grams.each do |top_gram|
|
30
|
-
# the >= removes the 'none' possibility
|
31
|
-
if gram_collection.fitness[top_gram] && gram_collection.fitness[top_gram] >= max_fitness
|
32
|
-
max_fitness = gram_collection.fitness[top_gram]
|
33
|
-
max_fit = top_gram
|
34
|
-
end
|
35
|
-
end
|
36
|
-
buckets[max_fit] ||= []
|
37
|
-
buckets[max_fit] << gram_collection.content
|
38
|
-
buckets
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
# ==== Return
|
43
|
-
# Hash - fitness => [gram_collection, ...]
|
44
|
-
# function worst case O(2 x (#frequent_grams x #gram_collections) + #all_grams)
|
45
|
-
def determine_frequency_term_sets(all_grams, query)
|
46
|
-
# only count a result if it has non-0 words length
|
47
|
-
effective_length = @gram_collections.reject do |result|
|
48
|
-
result.grams.nil? || result.grams.empty?
|
49
|
-
end.length
|
50
|
-
|
51
|
-
min_cover_l = MIN_SUPP_L * effective_length
|
52
|
-
# min_cover_h = MIN_SUPP_H * effective_length
|
53
|
-
|
54
|
-
# for speed only look at top N grams
|
55
|
-
# below block, worst case O(#all_grams)
|
56
|
-
frequent_grams = all_grams.sort do |gram1, gram2|
|
57
|
-
gram2.frequency <=> gram1.frequency
|
58
|
-
end.first(NUM_TOP_GRAMS)
|
59
|
-
|
60
|
-
# below block, worst case O(#frequent_grams x #gram_collections)
|
61
|
-
frequent_grams = frequent_grams.delete_if do |gram|
|
62
|
-
!cover(gram, min_cover_l)
|
63
|
-
end
|
64
|
-
|
65
|
-
# below block, worst case O(#frequent_grams x #gram_collections)
|
66
|
-
@gram_collections.inject(Hash.new(0)) do |top_grams, gram_collection|
|
67
|
-
max_fitness = 0
|
68
|
-
max_fit = nil
|
69
|
-
|
70
|
-
frequent_grams.each do |gram|
|
71
|
-
fitness = gram_collection.fitness[gram.content] = (gram_collection.content_to_frequency[gram.content] || 0) / gram.frequency.to_f
|
72
|
-
if fitness > max_fitness
|
73
|
-
max_fitness = fitness
|
74
|
-
max_fit = gram.content
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
# puts "#{max_fit}: #{max_fitness}"# if DEBUG
|
79
|
-
top_grams[max_fit] += 1 if max_fit
|
80
|
-
top_grams
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
# function worstcase O(#gram_collections)
|
85
|
-
def cover(gram, min_length)
|
86
|
-
((cached = @gram_cover_cache[gram]) != nil) and return cached
|
87
|
-
count = 0
|
88
|
-
@gram_collections.each do |gram_collection|
|
89
|
-
frequency = gram_collection.content_to_frequency[gram.content]
|
90
|
-
if !frequency.nil? && frequency > 0
|
91
|
-
count += 1
|
92
|
-
return @gram_cover_cache[gram] = true if count >= min_length
|
93
|
-
end
|
94
|
-
end
|
95
|
-
@gram_cover_cache[gram] = false
|
96
|
-
end
|
97
|
-
end
|
data/lib/utils/grams.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'gram_collection')
|
2
|
-
require File.join(File.dirname(__FILE__), 'gram_node')
|
3
|
-
|
4
|
-
module Utils
|
5
|
-
module Grams
|
6
|
-
def create_grams(query, records_to_words)
|
7
|
-
all_grams = []
|
8
|
-
@query = query
|
9
|
-
@query_terms = query.split.map(&:downcase).map(&:strip)
|
10
|
-
@query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
|
11
|
-
|
12
|
-
invalid = Proc.new do |gram, *args|
|
13
|
-
# remove [[gram]] if == [[query]]
|
14
|
-
gram == @query || gram == @query_alt || @query_terms.include?(gram)
|
15
|
-
end
|
16
|
-
|
17
|
-
gram_collections = records_to_words.map do |record, words|
|
18
|
-
gram_collection = GramCollection.new(record, words, invalid)
|
19
|
-
all_grams += gram_collection.grams
|
20
|
-
gram_collection
|
21
|
-
end
|
22
|
-
return gram_collections, make_grams_unique(all_grams)
|
23
|
-
end
|
24
|
-
|
25
|
-
def check_plurals(frequent_grams)
|
26
|
-
# if exists [[gram]] and [[gram]]s then remove [[gram]]s
|
27
|
-
frequent_grams_contents = frequent_grams.map(&:content)
|
28
|
-
frequent_grams.delete_if do |gram|
|
29
|
-
gram.content[-1] == 's' and
|
30
|
-
frequent_grams_contents.include?(gram.content[0...-1])
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def make_grams_unique(grams)
|
35
|
-
grams.inject({}) do |hash, gram|
|
36
|
-
if hash[gram.content]
|
37
|
-
hash[gram.content].frequency += gram.frequency
|
38
|
-
else
|
39
|
-
hash[gram.content] = gram
|
40
|
-
end
|
41
|
-
hash
|
42
|
-
end.values
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|