RubyGems - categorize - Versions diffs - 0.0.3 → 0.0.4 - Mend

categorize 0.0.3 → 0.0.4

Files changed (5) hide show

data/lib/categorize/models/cluster.rb +110 -0
data/lib/categorize/models/hierarchical_cluster.rb +27 -0
data/lib/categorize.rb +13 -0
metadata +8 -5
data/lib/categorize/utils/grams.rb +0 -46

data/lib/categorize/models/cluster.rb ADDED Viewed

@@ -0,0 +1,110 @@
+# encoding: utf-8
+module Categorize
+  module Models
+    class Cluster < AbstractModel
+      def initialize
+        @num_clusters = 10
+        @clusterer = Ai4r::Clusterers::WardLinkage.new
+        super
+      end
+      def model(query, records_to_tokens)
+        @query = query
+        dataset = build_vars(records_to_tokens)
+        @clusterer.build(dataset, @num_clusters)
+        build_categories(@clusterer.clusters)
+      end
+      def build_categories(clusters)
+        clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
+          [i, cluster.data_items.map { |v| @vectors.index(v) }]
+        end]
+        @query_terms ||= @query.split.map(&:downcase)
+        categories = clusters_to_records.map do |cluster, records|
+          term_vectors = records.map { |r| @vectors[r] }.transpose
+          tf = term_vectors.map { |f| f.reduce(&:+) }
+          get_bigram_max(records, tf)
+        end
+        records = clusters_to_records.values
+        # merge duplicate labeled categories
+        categories_records = []
+        categories.each_with_index do |category, i|
+          if j = categories[0...i].index(category) && categories_records[j]
+            categories_records[j].last + records.shift
+          else
+            categories_records << [category, records.shift]
+          end
+        end
+        categories_records
+      end
+      private
+        def df(term_vectors)
+          term_vectors.map do |f|
+            f.reduce { |count, tf| tf > 0 ? count + 1 : count }
+          end.flatten
+        end
+        def get_bigram_max(records, tf, df = false)
+          @bigram_max_cache[[records, tf, df]] ||= bigram_max(records, tf, df)
+        end
+        def bigram_max(records, tf, df)
+          bigrams = records.map { |r| get_grams(r) }.flatten.uniq
+          bigrams.max_by do |b|
+            b_terms = b.split
+            if b == @query || b_terms.include?(@query) ||
+              b_terms.any? { |t| @query_terms.include?(t) }
+              0
+            else
+              i, j = b_terms.map { |t| @labels.index(t) }
+              if df
+                df_i, df_j = [i, j].map { |k| df[k] }
+                df_i > 0 and df_j > 0 ? (tf[i] / df_i) * (tf[j] / df_j) : 0
+              else
+                tf[i] * tf[j]
+              end
+            end
+          end
+        end
+        def unigram_max(records, tf, df = false)
+          cluster_terms = records.map { |r| @tokens[r] }.flatten.uniq
+          cluster_terms.max_by do |t|
+            if t == @query || @query_terms.include?(t)
+              0
+            else
+              i = labels.index(t)
+              if df
+                df_i = df[i]
+                df_i > 0 ? tf[i] / df_i : 0
+              else
+                tf[i]
+              end
+            end
+          end
+        end
+        def get_grams(r)
+          @gram_cache[r] ||= gramize(@tokens[r])
+        end
+        def gramize(tokens)
+          last_token = nil
+          tokens = tokens.map do |token|
+            new_token = last_token and last_token != token
+            gram = (new_token) ? "#{last_token} #{token}" : nil
+            last_token = token
+            gram
+          end.compact.uniq
+        end
+    end
+  end
+end

data/lib/categorize/models/hierarchical_cluster.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# encoding: utf-8
+module Categorize
+  module Models
+    class HierarchicalCluster < Cluster
+      def initialize
+        super
+        @depth = 8
+        @clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(@depth)
+      end
+      def model(query, records_to_tokens)
+        @query = query
+        dataset = build_vars(records_to_tokens)
+        @num_clusters = 1
+        @clusterer.build(dataset, @num_clusters)
+        @num_clusters = 0
+        cluster_sets = nil
+        cluster_sets = @clusterer.cluster_tree.map do |clusters|
+          @num_clusters += 1
+          build_categories(clusters)
+        end
+        cluster_sets
+      end
+    end
+  end
+end

data/lib/categorize.rb ADDED Viewed

@@ -0,0 +1,13 @@
+# encoding: utf-8
+require 'categorize/models/abstract_model'
+require 'categorize/models/bag_of_words'
+require 'categorize/models/cluster'
+require 'categorize/models/hierarchical_cluster'
+require 'categorize/utils/gram_collection'
+require 'categorize/utils/gram_node'
+require 'categorize/utils/grams'
+require 'categorize/constants'
+require 'categorize/model'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: categorize
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
   prerelease:
 platform: ruby
 authors:
@@ -10,18 +10,21 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-06-28 00:00:00.000000000 Z
+date: 2013-04-14 00:00:00.000000000 Z
 dependencies: []
-description: Text categorization library
+description: ! "A text categorization library that favors performance.\n                      Built
+  for use in online systems."
 email: peter@helioid.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- lib/categorize.rb
 - lib/categorize/model.rb
 - lib/categorize/constants.rb
 - lib/categorize/models/bag_of_words.rb
-- lib/categorize/utils/grams.rb
+- lib/categorize/models/cluster.rb
+- lib/categorize/models/hierarchical_cluster.rb
 homepage: http://www.helioid.com/
 licenses: []
 post_install_message:
@@ -45,5 +48,5 @@ rubyforge_project:
 rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
-summary: Text categorization library
+summary: A text categorization library.
 test_files: []

data/lib/categorize/utils/grams.rb DELETED Viewed

@@ -1,46 +0,0 @@
-# encoding: utf-8
-module Categorize
-  module Utils
-    module Grams
-      def create_grams(query, records_to_words)
-        all_grams = []
-        @query = query
-        @query_terms = query.split.map(&:downcase).map(&:strip)
-        @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
-        invalid = Proc.new do |gram, *args|
-          # remove [[gram]] if == [[query]]
-          gram == @query || gram == @query_alt || @query_terms.include?(gram)
-        end
-        gram_collections = records_to_words.map do |record, words|
-          gram_collection = GramCollection.new(record, words, invalid)
-          all_grams += gram_collection.grams
-          gram_collection
-        end
-        return gram_collections, make_grams_unique(all_grams)
-      end
-      def check_plurals(frequent_grams)
-        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
-        frequent_grams_contents = frequent_grams.map(&:content)
-        frequent_grams.delete_if do |gram|
-          gram.content[-1] == 's' and
-            frequent_grams_contents.include?(gram.content[0...-1])
-        end
-      end
-      def make_grams_unique(grams)
-        grams.reduce({}) do |hash, gram|
-          if hash[gram.content]
-            hash[gram.content].frequency += gram.frequency
-          else
-            hash[gram.content] = gram
-          end
-          hash
-        end.values
-      end
-    end
-  end
-end