categorize 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/categorize/models/cluster.rb +110 -0
- data/lib/categorize/models/hierarchical_cluster.rb +27 -0
- data/lib/categorize.rb +13 -0
- metadata +8 -5
- data/lib/categorize/utils/grams.rb +0 -46
@@ -0,0 +1,110 @@
|
|
1
|
+
# encoding: utf-8

module Categorize
  module Models
    # Clusters records using Ward-linkage agglomerative clustering (AI4R) and
    # labels each cluster with its highest-scoring bigram.
    #
    # Relies on state prepared by AbstractModel#build_vars: @vectors (per-record
    # term-frequency vectors), @labels (term list aligned with vector columns),
    # @tokens (per-record token lists), plus the @bigram_max_cache and
    # @gram_cache hashes — TODO confirm against abstract_model.rb.
    class Cluster < AbstractModel

      def initialize
        # NOTE(review): 10 clusters is a fixed heuristic of this model.
        @num_clusters = 10
        @clusterer = Ai4r::Clusterers::WardLinkage.new
        super
      end

      # Builds the cluster model for +query+ over +records_to_tokens+ and
      # returns the labelled categories (see #build_categories).
      def model(query, records_to_tokens)
        @query = query
        dataset = build_vars(records_to_tokens)
        @clusterer.build(dataset, @num_clusters)
        build_categories(@clusterer.clusters)
      end

      # Maps each cluster to a [bigram_label, record_ids] pair, merging
      # clusters that receive the same label into one entry.
      def build_categories(clusters)
        clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
          [i, cluster.data_items.map { |v| @vectors.index(v) }]
        end]

        @query_terms ||= @query.split.map(&:downcase)

        categories = clusters_to_records.map do |cluster, records|
          term_vectors = records.map { |r| @vectors[r] }.transpose
          tf = term_vectors.map { |f| f.reduce(&:+) }
          get_bigram_max(records, tf)
        end

        records = clusters_to_records.values
        # merge duplicate labeled categories
        categories_records = []

        categories.each do |category|
          # BUG FIX: the original `if j = categories[0...i].index(category) &&
          # categories_records[j]` parsed as `j = (index(...) && records[j])`
          # (using j before assignment), and indexed into `categories` even
          # though merges shift positions in `categories_records`. Look up the
          # merge target directly in the accumulated result instead.
          existing = categories_records.find { |label, _| label == category }
          if existing
            # BUG FIX: the original computed `... .last + records.shift` and
            # discarded the result; concat mutates the stored record list.
            existing.last.concat(records.shift)
          else
            categories_records << [category, records.shift]
          end
        end

        categories_records
      end

      private

      # Document frequency per term: the number of records in which the term
      # appears with a non-zero frequency.
      def df(term_vectors)
        term_vectors.map do |f|
          # BUG FIX: the original reduce had no initial value, so the first
          # element seeded the accumulator as a term frequency, not a count.
          f.reduce(0) { |count, tf| tf > 0 ? count + 1 : count }
        end.flatten
      end

      # Memoized wrapper around #bigram_max.
      def get_bigram_max(records, tf, df = false)
        @bigram_max_cache[[records, tf, df]] ||= bigram_max(records, tf, df)
      end

      # Returns the bigram with the highest tf (or tf/df) product across the
      # cluster, skipping bigrams that overlap the query itself.
      def bigram_max(records, tf, df)
        bigrams = records.map { |r| get_grams(r) }.flatten.uniq
        bigrams.max_by do |b|
          b_terms = b.split
          if b == @query || b_terms.include?(@query) ||
              b_terms.any? { |t| @query_terms.include?(t) }
            0
          else
            i, j = b_terms.map { |t| @labels.index(t) }
            if df
              df_i, df_j = [i, j].map { |k| df[k] }
              # BUG FIX: `and` binds looser than the ternary, so the original
              # evaluated `df_i > 0 and (df_j > 0 ? ... : 0)` and could yield
              # a boolean instead of a score.
              (df_i > 0 && df_j > 0) ? (tf[i] / df_i) * (tf[j] / df_j) : 0
            else
              tf[i] * tf[j]
            end
          end
        end
      end

      # Unigram counterpart of #bigram_max: highest tf (or tf/df) single term.
      def unigram_max(records, tf, df = false)
        cluster_terms = records.map { |r| @tokens[r] }.flatten.uniq
        cluster_terms.max_by do |t|
          if t == @query || @query_terms.include?(t)
            0
          else
            # BUG FIX: the original referenced the undefined bare `labels`;
            # every sibling method uses the @labels ivar.
            i = @labels.index(t)
            if df
              df_i = df[i]
              df_i > 0 ? tf[i] / df_i : 0
            else
              tf[i]
            end
          end
        end
      end

      # Memoized bigram list for one record.
      def get_grams(r)
        @gram_cache[r] ||= gramize(@tokens[r])
      end

      # Builds the unique adjacent-pair bigrams from an ordered token list.
      def gramize(tokens)
        last_token = nil
        tokens.map do |token|
          # BUG FIX: `and` binds looser than `=`, so the original assigned
          # `new_token = last_token` and the inequality test was dead —
          # repeated tokens produced degenerate "x x" bigrams.
          new_token = last_token && last_token != token
          gram = new_token ? "#{last_token} #{token}" : nil
          last_token = token
          gram
        end.compact.uniq
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8

module Categorize
  module Models
    # Hierarchical variant of Cluster: builds a Ward-linkage dendrogram and
    # emits one labelled category set per level of the cluster tree.
    class HierarchicalCluster < Cluster
      def initialize
        super
        # Number of dendrogram levels retained by the hierarchical clusterer.
        @depth = 8
        @clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(@depth)
      end

      # Builds the tree for +query+ over +records_to_tokens+ and returns an
      # array of category sets, one per tree level. @num_clusters tracks the
      # level count as a side effect (as in the original implementation).
      def model(query, records_to_tokens)
        @query = query
        dataset = build_vars(records_to_tokens)
        @num_clusters = 1
        @clusterer.build(dataset, @num_clusters)
        @num_clusters = 0
        # IDIOM FIX: dropped the dead `cluster_sets = nil` pre-assignment that
        # was immediately overwritten; return the map result directly.
        @clusterer.cluster_tree.map do |clusters|
          @num_clusters += 1
          build_categories(clusters)
        end
      end
    end
  end
end
|
data/lib/categorize.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'categorize/models/abstract_model'
|
4
|
+
require 'categorize/models/bag_of_words'
|
5
|
+
require 'categorize/models/cluster'
|
6
|
+
require 'categorize/models/hierarchical_cluster'
|
7
|
+
|
8
|
+
require 'categorize/utils/gram_collection'
|
9
|
+
require 'categorize/utils/gram_node'
|
10
|
+
require 'categorize/utils/grams'
|
11
|
+
|
12
|
+
require 'categorize/constants'
|
13
|
+
require 'categorize/model'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,18 +10,21 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2013-04-14 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
|
-
description:
|
15
|
+
description: ! "A text categorization library that favors performance.\n Built
|
16
|
+
for use in online systems."
|
16
17
|
email: peter@helioid.com
|
17
18
|
executables: []
|
18
19
|
extensions: []
|
19
20
|
extra_rdoc_files: []
|
20
21
|
files:
|
22
|
+
- lib/categorize.rb
|
21
23
|
- lib/categorize/model.rb
|
22
24
|
- lib/categorize/constants.rb
|
23
25
|
- lib/categorize/models/bag_of_words.rb
|
24
|
-
- lib/categorize/
|
26
|
+
- lib/categorize/models/cluster.rb
|
27
|
+
- lib/categorize/models/hierarchical_cluster.rb
|
25
28
|
homepage: http://www.helioid.com/
|
26
29
|
licenses: []
|
27
30
|
post_install_message:
|
@@ -45,5 +48,5 @@ rubyforge_project:
|
|
45
48
|
rubygems_version: 1.8.24
|
46
49
|
signing_key:
|
47
50
|
specification_version: 3
|
48
|
-
summary:
|
51
|
+
summary: A text categorization library.
|
49
52
|
test_files: []
|
@@ -1,46 +0,0 @@
|
|
1
|
-
# encoding: utf-8

module Categorize
  module Utils
    # Helpers for building and pruning n-gram collections from records.
    module Grams
      # Builds one GramCollection per record in +records_to_words+, filtering
      # out grams that duplicate +query+ (or its word-rotated form).
      # Returns [gram_collections, unique_grams].
      def create_grams(query, records_to_words)
        all_grams = []
        @query = query
        @query_terms = query.split.map(&:downcase).map(&:strip)
        # BUG FIX: the original interpolated the Array @query_terms[1..-1]
        # directly, yielding e.g. '["b", "c"] a' (Array#to_s), so the
        # rotated-query filter could never match; join the words instead.
        @query_alt = "#{@query_terms[1..-1].join(' ')} #{@query_terms[0]}"

        invalid = Proc.new do |gram, *args|
          # remove [[gram]] if == [[query]]
          gram == @query || gram == @query_alt || @query_terms.include?(gram)
        end

        gram_collections = records_to_words.map do |record, words|
          gram_collection = GramCollection.new(record, words, invalid)
          all_grams += gram_collection.grams
          gram_collection
        end
        return gram_collections, make_grams_unique(all_grams)
      end

      # if exists [[gram]] and [[gram]]s then remove [[gram]]s
      # Mutates and returns +frequent_grams+.
      def check_plurals(frequent_grams)
        frequent_grams_contents = frequent_grams.map(&:content)
        frequent_grams.delete_if do |gram|
          # IDIOM FIX: && instead of `and` in a boolean expression.
          gram.content[-1] == 's' &&
            frequent_grams_contents.include?(gram.content[0...-1])
        end
      end

      # Collapses grams with identical content into one, summing frequencies.
      def make_grams_unique(grams)
        grams.reduce({}) do |hash, gram|
          if hash[gram.content]
            hash[gram.content].frequency += gram.frequency
          else
            hash[gram.content] = gram
          end
          hash
        end.values
      end
    end
  end
end
|