categorize 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/categorize/models/cluster.rb +110 -0
- data/lib/categorize/models/hierarchical_cluster.rb +27 -0
- data/lib/categorize.rb +13 -0
- metadata +8 -5
- data/lib/categorize/utils/grams.rb +0 -46
@@ -0,0 +1,110 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
|
4
|
+
module Models
|
5
|
+
class Cluster < AbstractModel
|
6
|
+
|
7
|
+
# Builds a clustering model with its defaults: a Ward-linkage clusterer
# from Ai4r and a target of 10 clusters, then hands over to the parent
# class initializer.
def initialize
  @clusterer = Ai4r::Clusterers::WardLinkage.new
  @num_clusters = 10
  super()
end
|
12
|
+
|
13
|
+
# Clusters the given records and labels each resulting cluster.
#
# @param query the search query; stored in @query for use while labelling
# @param records_to_tokens passed unchanged to build_vars (defined outside
#   this file section) to obtain the dataset handed to the clusterer
# @return whatever build_categories returns for the clusterer's clusters
def model(query, records_to_tokens)
  @query = query
  cluster_input = build_vars(records_to_tokens)
  @clusterer.build(cluster_input, @num_clusters)
  found_clusters = @clusterer.clusters
  build_categories(found_clusters)
end
|
19
|
+
|
20
|
+
# Labels every cluster with its best-scoring bigram and merges clusters that
# end up with the same label.
#
# @param clusters [Array] objects responding to #data_items; each data item
#   is looked up in @vectors to recover a record index
# @return [Array<Array>] [label, record_indices] pairs, in order of each
#   label's first appearance
#
# NOTE(review): records are recovered with @vectors.index, which returns the
# first match only; two records with identical vectors would both map to the
# same index — confirm whether that can happen upstream.
def build_categories(clusters)
  record_lists = clusters.map do |cluster|
    cluster.data_items.map { |vector| @vectors.index(vector) }
  end

  # NOTE(review): memoised, so a second call with a different @query keeps
  # the first query's terms — confirm this is intended.
  @query_terms ||= @query.split.map(&:downcase)

  cluster_labels = record_lists.map do |records|
    # Transpose so each row holds one term's frequency across the cluster's
    # records, then sum each row to get the cluster-wide term frequency.
    term_vectors = records.map { |r| @vectors[r] }.transpose
    tf = term_vectors.map { |frequencies| frequencies.reduce(:+) }
    get_bigram_max(records, tf)
  end

  # Merge clusters sharing a label. The previous implementation evaluated
  # `j = index && categories_records[j]` (so it indexed with nil) and threw
  # away the concatenated record list; track each label's slot explicitly.
  merged = []
  slot_for = {}
  cluster_labels.zip(record_lists) do |category, records|
    slot = slot_for[category]
    if slot
      merged[slot][1] += records
    else
      slot_for[category] = merged.size
      merged << [category, records]
    end
  end

  merged
end
|
47
|
+
|
48
|
+
private
|
49
|
+
# Document frequency per term: for each row of term frequencies, the number
# of entries greater than zero.
#
# The earlier seedless `reduce` used the first frequency itself as the
# starting count, so e.g. [2, 0, 3] produced 3 instead of 2.
#
# @param term_vectors [Array<Array<Numeric>>] one row of frequencies per term
# @return [Array<Integer>] count of positive entries for each row
def df(term_vectors)
  term_vectors.map do |frequencies|
    frequencies.count { |tf| tf > 0 }
  end
end
|
54
|
+
|
55
|
+
# Cached front end for bigram_max: results are stored in @bigram_max_cache
# under the key [records, tf, df]. A falsy cached value (e.g. nil when no
# bigram was found) is recomputed on the next call.
def get_bigram_max(records, tf, df = false)
  cache_key = [records, tf, df]
  cached = @bigram_max_cache[cache_key]
  return cached if cached

  @bigram_max_cache[cache_key] = bigram_max(records, tf, df)
end
|
58
|
+
|
59
|
+
# Picks the highest-scoring bigram among the grams of the given records.
#
# A bigram scores 0 when it equals the query or contains the query / any
# query term. Otherwise its score is the product of its two terms'
# frequencies, each divided by the term's document frequency when `df` is
# supplied.
#
# @param records [Array] record keys understood by get_grams
# @param tf [Array<Numeric>] term frequencies, indexed like @labels
# @param df [Array<Numeric>, false] document frequencies indexed like
#   @labels, or false to score on raw term frequency
# @return [String, nil] the best bigram, or nil when there are no bigrams
def bigram_max(records, tf, df)
  bigrams = records.flat_map { |r| get_grams(r) }.uniq

  bigrams.max_by do |bigram|
    terms = bigram.split
    mentions_query = bigram == @query || terms.include?(@query) ||
                     terms.any? { |t| @query_terms.include?(t) }
    next 0 if mentions_query

    i, j = terms.map { |t| @labels.index(t) }
    # A term without a label has no frequency to score with; rank it last
    # instead of raising on tf[nil].
    next 0 if i.nil? || j.nil?

    if df
      df_i = df[i]
      df_j = df[j]
      # `&&` (not `and`) so the whole conjunction guards the ternary;
      # with `and` the block returned false whenever df_i was 0, which
      # max_by cannot compare against numbers.
      # NOTE(review): with Integer inputs these divisions truncate —
      # confirm whether tf/df are expected to be floats.
      df_i > 0 && df_j > 0 ? (tf[i] / df_i) * (tf[j] / df_j) : 0
    else
      tf[i] * tf[j]
    end
  end
end
|
77
|
+
|
78
|
+
# Picks the highest-scoring single term among the tokens of the given
# records.
#
# The query itself and any query term score 0. Other terms score their term
# frequency, divided by document frequency when `df` is supplied (0 if that
# document frequency is not positive).
#
# @param records [Array] keys into @tokens
# @param tf [Array<Numeric>] term frequencies, indexed like @labels
# @param df [Array<Numeric>, false] document frequencies indexed like
#   @labels, or false to score on raw term frequency
# @return the best term, or nil when the records have no tokens
def unigram_max(records, tf, df = false)
  cluster_terms = records.map { |r| @tokens[r] }.flatten.uniq

  cluster_terms.max_by do |term|
    next 0 if term == @query || @query_terms.include?(term)

    # Read @labels directly, as bigram_max does; no `labels` reader is
    # visible in this file.
    i = @labels.index(term)
    # A term without a label has no frequency to score with.
    next 0 if i.nil?

    if df
      df_i = df[i]
      df_i > 0 ? tf[i] / df_i : 0
    else
      tf[i]
    end
  end
end
|
94
|
+
|
95
|
+
# Returns the grams for record key `r`, computing them from @tokens[r] with
# gramize on first use and keeping the result in @gram_cache.
def get_grams(r)
  cached = @gram_cache[r]
  return cached if cached

  @gram_cache[r] = gramize(@tokens[r])
end
|
98
|
+
|
99
|
+
# Builds the unique bigrams ("first second") formed by adjacent tokens,
# skipping pairs where a token is immediately repeated.
#
# The earlier `new_token = last_token and last_token != token` assigned only
# `last_token` (assignment binds tighter than `and`), so repeated tokens
# still produced grams such as "a a".
#
# @param tokens [Array<String>] tokens in their original order
# @return [Array<String>] distinct bigrams in order of first appearance
def gramize(tokens)
  grams = tokens.each_cons(2).map do |previous, current|
    "#{previous} #{current}" if previous && previous != current
  end
  grams.compact.uniq
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Categorize
|
4
|
+
module Models
|
5
|
+
class HierarchicalCluster < Cluster
|
6
|
+
# Runs the parent initializer first, then swaps in a hierarchical
# Ward-linkage clusterer constructed with @depth (8).
def initialize
  super()
  @depth = 8
  @clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(@depth)
end
|
11
|
+
|
12
|
+
# Builds the clusterer with a single target cluster, then labels every
# entry of the clusterer's cluster_tree.
#
# @param query the search query; stored in @query for use while labelling
# @param records_to_tokens passed unchanged to build_vars to obtain the
#   dataset handed to the clusterer
# @return [Array] one build_categories result per cluster_tree entry;
#   @num_clusters is incremented before each entry is processed
def model(query, records_to_tokens)
  @query = query
  cluster_input = build_vars(records_to_tokens)

  @num_clusters = 1
  @clusterer.build(cluster_input, @num_clusters)

  @num_clusters = 0
  @clusterer.cluster_tree.map do |tree_entry|
    @num_clusters += 1
    build_categories(tree_entry)
  end
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/lib/categorize.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'categorize/models/abstract_model'
|
4
|
+
require 'categorize/models/bag_of_words'
|
5
|
+
require 'categorize/models/cluster'
|
6
|
+
require 'categorize/models/hierarchical_cluster'
|
7
|
+
|
8
|
+
require 'categorize/utils/gram_collection'
|
9
|
+
require 'categorize/utils/gram_node'
|
10
|
+
require 'categorize/utils/grams'
|
11
|
+
|
12
|
+
require 'categorize/constants'
|
13
|
+
require 'categorize/model'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,18 +10,21 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2013-04-14 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
|
-
description:
|
15
|
+
description: ! "A text categorization library that favors performance.\n Built
|
16
|
+
for use in online systems."
|
16
17
|
email: peter@helioid.com
|
17
18
|
executables: []
|
18
19
|
extensions: []
|
19
20
|
extra_rdoc_files: []
|
20
21
|
files:
|
22
|
+
- lib/categorize.rb
|
21
23
|
- lib/categorize/model.rb
|
22
24
|
- lib/categorize/constants.rb
|
23
25
|
- lib/categorize/models/bag_of_words.rb
|
24
|
-
- lib/categorize/utils/grams.rb
|
26
|
+
- lib/categorize/models/cluster.rb
|
27
|
+
- lib/categorize/models/hierarchical_cluster.rb
|
25
28
|
homepage: http://www.helioid.com/
|
26
29
|
licenses: []
|
27
30
|
post_install_message:
|
@@ -45,5 +48,5 @@ rubyforge_project:
|
|
45
48
|
rubygems_version: 1.8.24
|
46
49
|
signing_key:
|
47
50
|
specification_version: 3
|
48
|
-
summary:
|
51
|
+
summary: A text categorization library.
|
49
52
|
test_files: []
|
@@ -1,46 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Categorize
|
4
|
-
module Utils
|
5
|
-
module Grams
|
6
|
-
# Builds a GramCollection per record and the de-duplicated list of all
# their grams.
#
# Grams equal to the query, to the query with its first term moved to the
# end, or to any single query term are reported as invalid to each
# GramCollection through the `invalid` proc.
#
# @param query [String] the search query; stored in @query
# @param records_to_words [Enumerable] yields (record, words) pairs
# @return [Array] two elements: the GramCollection list and the result of
#   make_grams_unique over every collection's grams
def create_grams(query, records_to_words)
  @query = query
  @query_terms = query.split.map(&:downcase)
  # Interpolating the Array slice used to embed its inspect form
  # (e.g. `["b", "c"] a`); rotate and join to get "b c a".
  @query_alt = @query_terms.rotate.join(' ')

  # Proc (not lambda) so callers may pass extra arguments freely.
  invalid = Proc.new do |gram, *_args|
    gram == @query || gram == @query_alt || @query_terms.include?(gram)
  end

  gram_collections = records_to_words.map do |record, words|
    GramCollection.new(record, words, invalid)
  end
  all_grams = gram_collections.flat_map(&:grams)

  [gram_collections, make_grams_unique(all_grams)]
end
|
24
|
-
|
25
|
-
# Removes, in place, every gram whose content ends in "s" when the same
# content without that trailing "s" is also present in the list.
#
# @param frequent_grams [Array] objects responding to #content; mutated
# @return [Array] the same (mutated) array
def check_plurals(frequent_grams)
  # Snapshot the contents first so deletions do not affect the lookups.
  known_contents = frequent_grams.map(&:content)

  frequent_grams.delete_if do |gram|
    content = gram.content
    content.end_with?('s') && known_contents.include?(content[0...-1])
  end
end
|
33
|
-
|
34
|
-
# Collapses grams with identical content into one entry, summing their
# frequencies.
#
# The first gram seen for each content is kept and its #frequency is
# increased in place by the frequency of every later duplicate.
#
# @param grams [Array] objects responding to #content, #frequency and
#   #frequency=
# @return [Array] one gram per distinct content, in first-seen order
def make_grams_unique(grams)
  by_content = grams.each_with_object({}) do |gram, seen|
    kept = seen[gram.content]
    if kept
      kept.frequency += gram.frequency
    else
      seen[gram.content] = gram
    end
  end
  by_content.values
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|