categorize 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,110 @@
# encoding: utf-8

module Categorize
  module Models
    # Clusters records with Ai4r's Ward-linkage clusterer and labels every
    # cluster with the highest scoring bigram found in its records.
    #
    # NOTE(review): build_vars, @vectors, @labels, @tokens, @gram_cache and
    # @bigram_max_cache are not defined in this file; they are presumably
    # provided by AbstractModel -- confirm.
    class Cluster < AbstractModel
      def initialize
        @num_clusters = 10
        @clusterer = Ai4r::Clusterers::WardLinkage.new
        super
      end

      # Clusters the records and labels the resulting clusters.
      #
      # query             - String query; bigrams containing it (or any of
      #                     its terms) are scored 0 when picking labels.
      # records_to_tokens - passed unchanged to build_vars.
      #
      # Returns the value of build_categories for the clusterer's clusters.
      def model(query, records_to_tokens)
        @query = query
        dataset = build_vars(records_to_tokens)
        @clusterer.build(dataset, @num_clusters)
        build_categories(@clusterer.clusters)
      end

      # Labels each cluster and merges clusters that received the same label.
      #
      # clusters - objects responding to data_items, whose items are looked
      #            up in @vectors.
      #
      # Returns an Array of [label, record_indices] pairs, ordered by the
      # first appearance of each label.
      def build_categories(clusters)
        # NOTE(review): Array#index returns the first match, so records with
        # identical vectors all resolve to the same index -- confirm that
        # build_vars cannot produce duplicate vectors.
        cluster_records = clusters.map do |cluster|
          cluster.data_items.map { |vector| @vectors.index(vector) }
        end

        # NOTE(review): memoized, so it is not refreshed when @query changes
        # between calls unless it is reset elsewhere -- confirm.
        @query_terms ||= @query.split.map(&:downcase)

        categories = cluster_records.map do |records|
          term_vectors = records.map { |r| @vectors[r] }.transpose
          tf = term_vectors.map { |frequencies| frequencies.reduce(&:+) }
          get_bigram_max(records, tf)
        end

        merge_duplicate_categories(categories, cluster_records)
      end

      private

      # Combines the record lists of clusters that share a label.
      #
      # The record arrays are never mutated (a new array is built with +)
      # because get_bigram_max uses them as part of its cache key.
      def merge_duplicate_categories(categories, cluster_records)
        merged = []
        position_of = {}

        categories.zip(cluster_records).each do |category, records|
          if position_of.key?(category)
            entry = merged[position_of[category]]
            entry[1] = entry[1] + records
          else
            position_of[category] = merged.size
            merged << [category, records]
          end
        end

        merged
      end

      # Counts, for every term vector, how many entries are greater than 0.
      def df(term_vectors)
        term_vectors.map do |frequencies|
          frequencies.count { |tf| tf > 0 }
        end
      end

      # Memoizing wrapper around bigram_max, keyed by all three arguments.
      def get_bigram_max(records, tf, df = false)
        @bigram_max_cache[[records, tf, df]] ||= bigram_max(records, tf, df)
      end

      # Picks the best scoring bigram among the given records.
      #
      # records - Array of record indices.
      # tf      - Array of term frequencies indexed like @labels.
      # df      - Array of document frequencies indexed like @labels, or
      #           false to score on term frequency alone.
      #
      # Bigrams that equal the query or contain a query term score 0.
      # Returns the winning bigram String, or nil when there are no bigrams.
      def bigram_max(records, tf, df)
        bigrams = records.flat_map { |r| get_grams(r) }.uniq
        bigrams.max_by do |bigram|
          terms = bigram.split
          if bigram == @query || terms.include?(@query) ||
             terms.any? { |t| @query_terms.include?(t) }
            0
          else
            i, j = terms.map { |t| @labels.index(t) }
            if df
              df_i = df[i]
              df_j = df[j]
              # && (not `and`) so the ternary covers the whole condition and
              # the block always yields a number.
              df_i > 0 && df_j > 0 ? (tf[i] / df_i) * (tf[j] / df_j) : 0
            else
              tf[i] * tf[j]
            end
          end
        end
      end

      # Single-term counterpart of bigram_max; same arguments and scoring.
      def unigram_max(records, tf, df = false)
        cluster_terms = records.flat_map { |r| @tokens[r] }.uniq
        cluster_terms.max_by do |term|
          if term == @query || @query_terms.include?(term)
            0
          else
            i = @labels.index(term)
            if df
              df_i = df[i]
              df_i > 0 ? tf[i] / df_i : 0
            else
              tf[i]
            end
          end
        end
      end

      # Memoized bigrams for the record at index r.
      def get_grams(r)
        @gram_cache[r] ||= gramize(@tokens[r])
      end

      # Builds the unique "first second" bigrams of adjacent tokens,
      # skipping pairs whose two tokens are identical.
      def gramize(tokens)
        tokens.each_cons(2)
              .reject { |previous, current| previous == current }
              .map { |previous, current| "#{previous} #{current}" }
              .uniq
      end
    end
  end
end
@@ -0,0 +1,27 @@
# encoding: utf-8

module Categorize
  module Models
    # Cluster variant that labels every level of a hierarchical Ward-linkage
    # cluster tree instead of a single flat clustering.
    class HierarchicalCluster < Cluster
      def initialize
        super
        @depth = 8
        # Replaces the clusterer assigned by the parent constructor.
        @clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(@depth)
      end

      # Builds the cluster tree and labels each of its levels.
      #
      # query             - String query stored in @query.
      # records_to_tokens - passed unchanged to build_vars.
      #
      # Returns an Array with one build_categories result per entry of the
      # clusterer's cluster_tree.
      def model(query, records_to_tokens)
        @query = query
        dataset = build_vars(records_to_tokens)

        @num_clusters = 1
        @clusterer.build(dataset, @num_clusters)

        # Restart the counter; it is bumped once per tree level below.
        @num_clusters = 0
        @clusterer.cluster_tree.map do |level_clusters|
          @num_clusters += 1
          build_categories(level_clusters)
        end
      end
    end
  end
end
data/lib/categorize.rb ADDED
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
+ require 'categorize/models/abstract_model'
4
+ require 'categorize/models/bag_of_words'
5
+ require 'categorize/models/cluster'
6
+ require 'categorize/models/hierarchical_cluster'
7
+
8
+ require 'categorize/utils/gram_collection'
9
+ require 'categorize/utils/gram_node'
10
+ require 'categorize/utils/grams'
11
+
12
+ require 'categorize/constants'
13
+ require 'categorize/model'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,18 +10,21 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2012-06-28 00:00:00.000000000 Z
13
+ date: 2013-04-14 00:00:00.000000000 Z
14
14
  dependencies: []
15
- description: Text categorization library
15
+ description: ! "A text categorization library that favors performance.\n Built
16
+ for use in online systems."
16
17
  email: peter@helioid.com
17
18
  executables: []
18
19
  extensions: []
19
20
  extra_rdoc_files: []
20
21
  files:
22
+ - lib/categorize.rb
21
23
  - lib/categorize/model.rb
22
24
  - lib/categorize/constants.rb
23
25
  - lib/categorize/models/bag_of_words.rb
24
- - lib/categorize/utils/grams.rb
26
+ - lib/categorize/models/cluster.rb
27
+ - lib/categorize/models/hierarchical_cluster.rb
25
28
  homepage: http://www.helioid.com/
26
29
  licenses: []
27
30
  post_install_message:
@@ -45,5 +48,5 @@ rubyforge_project:
45
48
  rubygems_version: 1.8.24
46
49
  signing_key:
47
50
  specification_version: 3
48
- summary: Text categorization library
51
+ summary: A text categorization library.
49
52
  test_files: []
@@ -1,46 +0,0 @@
# encoding: utf-8

# NOTE(review): the surrounding diff hunk lists this file as removed in
# 0.0.4, while the added lib/categorize.rb still requires
# 'categorize/utils/grams' -- confirm the packaging.
module Categorize
  module Utils
    # Helpers for building and de-duplicating grams for a query.
    module Grams
      # Builds one GramCollection per record and the de-duplicated list of
      # all their grams.
      #
      # query            - String query; stored in @query.
      # records_to_words - enumerable of [record, words] pairs.
      #
      # Grams equal to the query, to the query with its first term moved to
      # the end, or to a single query term are flagged by the `invalid`
      # callback handed to every GramCollection.
      #
      # Returns [gram_collections, unique_grams].
      def create_grams(query, records_to_words)
        @query = query
        @query_terms = query.split.map { |term| term.downcase.strip }
        # Join explicitly: interpolating an Array would embed its inspect
        # form (e.g. '["b"] a') on Ruby 1.9+.
        @query_alt = @query_terms.rotate.join(' ')

        invalid = Proc.new do |gram, *_args|
          # remove [[gram]] if == [[query]]
          gram == @query || gram == @query_alt || @query_terms.include?(gram)
        end

        gram_collections = records_to_words.map do |record, words|
          GramCollection.new(record, words, invalid)
        end
        all_grams = gram_collections.flat_map(&:grams)

        [gram_collections, make_grams_unique(all_grams)]
      end

      # Removes, in place, every gram ending in "s" whose content without
      # that final "s" is also present. Returns frequent_grams.
      def check_plurals(frequent_grams)
        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
        contents = frequent_grams.map(&:content)
        frequent_grams.delete_if do |gram|
          gram.content.end_with?('s') &&
            contents.include?(gram.content[0...-1])
        end
      end

      # Collapses grams with equal content into the first one seen, adding
      # the later frequencies onto it (the first gram is mutated).
      #
      # Returns the surviving grams in first-seen order.
      def make_grams_unique(grams)
        grams.each_with_object({}) do |gram, by_content|
          if by_content[gram.content]
            by_content[gram.content].frequency += gram.frequency
          else
            by_content[gram.content] = gram
          end
        end.values
      end
    end
  end
end