categorize 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/categorize/models/cluster.rb ADDED
@@ -0,0 +1,110 @@
+ # encoding: utf-8
+
+ module Categorize
+   module Models
+     class Cluster < AbstractModel
+
+       def initialize
+         @num_clusters = 10
+         @clusterer = Ai4r::Clusterers::WardLinkage.new
+         super
+       end
+
+       def model(query, records_to_tokens)
+         @query = query
+         dataset = build_vars(records_to_tokens)
+         @clusterer.build(dataset, @num_clusters)
+         build_categories(@clusterer.clusters)
+       end
+
+       def build_categories(clusters)
+         clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
+           [i, cluster.data_items.map { |v| @vectors.index(v) }]
+         end]
+
+         @query_terms ||= @query.split.map(&:downcase)
+
+         categories = clusters_to_records.map do |cluster, records|
+           term_vectors = records.map { |r| @vectors[r] }.transpose
+           tf = term_vectors.map { |f| f.reduce(&:+) }
+           get_bigram_max(records, tf)
+         end
+
+         records = clusters_to_records.values
+         # merge duplicate labeled categories
+         categories_records = []
+
+         categories.each_with_index do |category, i|
+           if j = categories[0...i].index(category) && categories_records[j]
+             categories_records[j].last + records.shift
+           else
+             categories_records << [category, records.shift]
+           end
+         end
+
+         categories_records
+       end
+
+       private
+       def df(term_vectors)
+         term_vectors.map do |f|
+           f.reduce { |count, tf| tf > 0 ? count + 1 : count }
+         end.flatten
+       end
+
+       def get_bigram_max(records, tf, df = false)
+         @bigram_max_cache[[records, tf, df]] ||= bigram_max(records, tf, df)
+       end
+
+       def bigram_max(records, tf, df)
+         bigrams = records.map { |r| get_grams(r) }.flatten.uniq
+         bigrams.max_by do |b|
+           b_terms = b.split
+           if b == @query || b_terms.include?(@query) ||
+               b_terms.any? { |t| @query_terms.include?(t) }
+             0
+           else
+             i, j = b_terms.map { |t| @labels.index(t) }
+             if df
+               df_i, df_j = [i, j].map { |k| df[k] }
+               df_i > 0 and df_j > 0 ? (tf[i] / df_i) * (tf[j] / df_j) : 0
+             else
+               tf[i] * tf[j]
+             end
+           end
+         end
+       end
+
+       def unigram_max(records, tf, df = false)
+         cluster_terms = records.map { |r| @tokens[r] }.flatten.uniq
+         cluster_terms.max_by do |t|
+           if t == @query || @query_terms.include?(t)
+             0
+           else
+             i = labels.index(t)
+             if df
+               df_i = df[i]
+               df_i > 0 ? tf[i] / df_i : 0
+             else
+               tf[i]
+             end
+           end
+         end
+       end
+
+       def get_grams(r)
+         @gram_cache[r] ||= gramize(@tokens[r])
+       end
+
+       def gramize(tokens)
+         last_token = nil
+         tokens = tokens.map do |token|
+           new_token = last_token and last_token != token
+           gram = (new_token) ? "#{last_token} #{token}" : nil
+           last_token = token
+           gram
+         end.compact.uniq
+       end
+     end
+   end
+ end
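The new Cluster model groups records with Ward-linkage agglomerative clustering (via the ai4r gem) and labels each cluster with the adjacent-token bigram whose term-frequency product is highest, skipping bigrams that overlap the query. The vector and vocabulary state it relies on (`build_vars`, `@vectors`, `@labels`, `@tokens`) lives in `AbstractModel`, which is not part of this diff, so the snippet below is only a self-contained sketch of the bigram-labeling idea with made-up inputs, not the gem's actual API:

```ruby
# Illustration only: label one cluster of token lists with its
# highest-scoring adjacent bigram, scored by tf(term_a) * tf(term_b),
# mirroring the gramize + bigram_max pair above. Inputs are hypothetical.
def label_cluster(token_lists, query)
  query_terms = query.split.map(&:downcase)

  # term frequencies across the whole cluster
  tf = Hash.new(0)
  token_lists.each { |tokens| tokens.each { |t| tf[t] += 1 } }

  # adjacent-token bigrams, skipping repeated tokens, deduplicated
  bigrams = token_lists.flat_map do |tokens|
    tokens.each_cons(2).reject { |a, b| a == b }.map { |a, b| "#{a} #{b}" }
  end.uniq

  bigrams.max_by do |bigram|
    terms = bigram.split
    next 0 if terms.any? { |t| query_terms.include?(t) }
    terms.map { |t| tf[t] }.reduce(:*)
  end
end

puts label_cluster([%w[ruby text categorization], %w[text categorization library]],
                   'ruby gems')
# => "text categorization"
```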
data/lib/categorize/models/hierarchical_cluster.rb ADDED
@@ -0,0 +1,27 @@
+ # encoding: utf-8
+
+ module Categorize
+   module Models
+     class HierarchicalCluster < Cluster
+       def initialize
+         super
+         @depth = 8
+         @clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(@depth)
+       end
+
+       def model(query, records_to_tokens)
+         @query = query
+         dataset = build_vars(records_to_tokens)
+         @num_clusters = 1
+         @clusterer.build(dataset, @num_clusters)
+         @num_clusters = 0
+         cluster_sets = nil
+         cluster_sets = @clusterer.cluster_tree.map do |clusters|
+           @num_clusters += 1
+           build_categories(clusters)
+         end
+         cluster_sets
+       end
+     end
+   end
+ end
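HierarchicalCluster swaps in `Ai4r::Clusterers::WardLinkageHierarchical` and, rather than returning one flat partition, maps every level of the clusterer's `cluster_tree` through `build_categories`, so `model` yields one set of `[label, records]` pairs per level of the merge tree. A hypothetical consumer might pick the shallowest level that produces enough categories; the sketch below assumes the gem and its ai4r dependency are installed, and that `records_to_tokens` is a record-id-to-token-array hash of the shape `AbstractModel#build_vars` (not shown in this diff) accepts:

```ruby
require 'categorize'

# Hypothetical input: record ids mapped to token arrays.
records_to_tokens = {
  1 => %w[ruby text categorization],
  2 => %w[text categorization library],
  3 => %w[image processing library]
}

categorizer = Categorize::Models::HierarchicalCluster.new
cluster_sets = categorizer.model('search', records_to_tokens)

# One entry per tree level, each an array of [label, records] pairs
# as returned by build_categories; take the first level with >= 2 labels.
chosen = cluster_sets.find { |set| set.length >= 2 } || cluster_sets.last
```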
data/lib/categorize.rb ADDED
@@ -0,0 +1,13 @@
+ # encoding: utf-8
+
+ require 'categorize/models/abstract_model'
+ require 'categorize/models/bag_of_words'
+ require 'categorize/models/cluster'
+ require 'categorize/models/hierarchical_cluster'
+
+ require 'categorize/utils/gram_collection'
+ require 'categorize/utils/gram_node'
+ require 'categorize/utils/grams'
+
+ require 'categorize/constants'
+ require 'categorize/model'
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: categorize
  version: !ruby/object:Gem::Version
- version: 0.0.3
+ version: 0.0.4
  prerelease:
  platform: ruby
  authors:
@@ -10,18 +10,21 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-06-28 00:00:00.000000000 Z
+ date: 2013-04-14 00:00:00.000000000 Z
  dependencies: []
- description: Text categorization library
+ description: ! "A text categorization library that favors performance.\n Built
+ for use in online systems."
  email: peter@helioid.com
  executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - lib/categorize.rb
  - lib/categorize/model.rb
  - lib/categorize/constants.rb
  - lib/categorize/models/bag_of_words.rb
- - lib/categorize/utils/grams.rb
+ - lib/categorize/models/cluster.rb
+ - lib/categorize/models/hierarchical_cluster.rb
  homepage: http://www.helioid.com/
  licenses: []
  post_install_message:
@@ -45,5 +48,5 @@ rubyforge_project:
  rubygems_version: 1.8.24
  signing_key:
  specification_version: 3
- summary: Text categorization library
+ summary: A text categorization library.
  test_files: []
data/lib/categorize/utils/grams.rb DELETED
@@ -1,46 +0,0 @@
- # encoding: utf-8
-
- module Categorize
-   module Utils
-     module Grams
-       def create_grams(query, records_to_words)
-         all_grams = []
-         @query = query
-         @query_terms = query.split.map(&:downcase).map(&:strip)
-         @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
-
-         invalid = Proc.new do |gram, *args|
-           # remove [[gram]] if == [[query]]
-           gram == @query || gram == @query_alt || @query_terms.include?(gram)
-         end
-
-         gram_collections = records_to_words.map do |record, words|
-           gram_collection = GramCollection.new(record, words, invalid)
-           all_grams += gram_collection.grams
-           gram_collection
-         end
-         return gram_collections, make_grams_unique(all_grams)
-       end
-
-       def check_plurals(frequent_grams)
-         # if exists [[gram]] and [[gram]]s then remove [[gram]]s
-         frequent_grams_contents = frequent_grams.map(&:content)
-         frequent_grams.delete_if do |gram|
-           gram.content[-1] == 's' and
-             frequent_grams_contents.include?(gram.content[0...-1])
-         end
-       end
-
-       def make_grams_unique(grams)
-         grams.reduce({}) do |hash, gram|
-           if hash[gram.content]
-             hash[gram.content].frequency += gram.frequency
-           else
-             hash[gram.content] = gram
-           end
-           hash
-         end.values
-       end
-     end
-   end
- end
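The removed Grams mixin handled query filtering, plural folding, and merging of duplicate grams across records. For reference, here is a minimal self-contained sketch of the merge step it performed, with a hypothetical `Gram` struct standing in for the gem's gram objects:

```ruby
# Hypothetical stand-in for the gem's gram objects.
Gram = Struct.new(:content, :frequency)

# Merge grams that share content, summing frequencies -- the same
# reduce-into-a-hash pattern as the removed make_grams_unique above.
def make_grams_unique(grams)
  grams.reduce({}) do |hash, gram|
    if hash[gram.content]
      hash[gram.content].frequency += gram.frequency
    else
      hash[gram.content] = gram
    end
    hash
  end.values
end

grams = [Gram.new('text mining', 2), Gram.new('text mining', 1), Gram.new('ruby gem', 1)]
p make_grams_unique(grams).map { |g| [g.content, g.frequency] }
# => [["text mining", 3], ["ruby gem", 1]]
```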