categorize 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/categorize/models/cluster.rb +110 -0
- data/lib/categorize/models/hierarchical_cluster.rb +27 -0
- data/lib/categorize.rb +13 -0
- metadata +8 -5
- data/lib/categorize/utils/grams.rb +0 -46
@@ -0,0 +1,110 @@
|
|
1
|
+
# encoding: utf-8

module Categorize
  module Models
    # Clusters records using Ward-linkage agglomerative clustering (AI4R) and
    # labels each cluster with its highest-scoring bigram.
    #
    # Relies on state prepared by AbstractModel#build_vars: @vectors (per-record
    # term-frequency vectors), @labels (term list aligned with vector columns),
    # @tokens (per-record token lists), plus the @bigram_max_cache and
    # @gram_cache hashes — TODO confirm against abstract_model.rb.
    class Cluster < AbstractModel

      def initialize
        # NOTE(review): 10 clusters is a fixed heuristic of this model.
        @num_clusters = 10
        @clusterer = Ai4r::Clusterers::WardLinkage.new
        super
      end

      # Builds the cluster model for +query+ over +records_to_tokens+ and
      # returns the labelled categories (see #build_categories).
      def model(query, records_to_tokens)
        @query = query
        dataset = build_vars(records_to_tokens)
        @clusterer.build(dataset, @num_clusters)
        build_categories(@clusterer.clusters)
      end

      # Maps each cluster to a [bigram_label, record_ids] pair, merging
      # clusters that receive the same label into one entry.
      def build_categories(clusters)
        clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
          [i, cluster.data_items.map { |v| @vectors.index(v) }]
        end]

        @query_terms ||= @query.split.map(&:downcase)

        categories = clusters_to_records.map do |cluster, records|
          term_vectors = records.map { |r| @vectors[r] }.transpose
          tf = term_vectors.map { |f| f.reduce(&:+) }
          get_bigram_max(records, tf)
        end

        records = clusters_to_records.values
        # merge duplicate labeled categories
        categories_records = []

        categories.each do |category|
          # BUG FIX: the original `if j = categories[0...i].index(category) &&
          # categories_records[j]` parsed as `j = (index(...) && records[j])`
          # (using j before assignment), and indexed into `categories` even
          # though merges shift positions in `categories_records`. Look up the
          # merge target directly in the accumulated result instead.
          existing = categories_records.find { |label, _| label == category }
          if existing
            # BUG FIX: the original computed `... .last + records.shift` and
            # discarded the result; concat mutates the stored record list.
            existing.last.concat(records.shift)
          else
            categories_records << [category, records.shift]
          end
        end

        categories_records
      end

      private

      # Document frequency per term: the number of records in which the term
      # appears with a non-zero frequency.
      def df(term_vectors)
        term_vectors.map do |f|
          # BUG FIX: the original reduce had no initial value, so the first
          # element seeded the accumulator as a term frequency, not a count.
          f.reduce(0) { |count, tf| tf > 0 ? count + 1 : count }
        end.flatten
      end

      # Memoized wrapper around #bigram_max.
      def get_bigram_max(records, tf, df = false)
        @bigram_max_cache[[records, tf, df]] ||= bigram_max(records, tf, df)
      end

      # Returns the bigram with the highest tf (or tf/df) product across the
      # cluster, skipping bigrams that overlap the query itself.
      def bigram_max(records, tf, df)
        bigrams = records.map { |r| get_grams(r) }.flatten.uniq
        bigrams.max_by do |b|
          b_terms = b.split
          if b == @query || b_terms.include?(@query) ||
              b_terms.any? { |t| @query_terms.include?(t) }
            0
          else
            i, j = b_terms.map { |t| @labels.index(t) }
            if df
              df_i, df_j = [i, j].map { |k| df[k] }
              # BUG FIX: `and` binds looser than the ternary, so the original
              # evaluated `df_i > 0 and (df_j > 0 ? ... : 0)` and could yield
              # a boolean instead of a score.
              (df_i > 0 && df_j > 0) ? (tf[i] / df_i) * (tf[j] / df_j) : 0
            else
              tf[i] * tf[j]
            end
          end
        end
      end

      # Unigram counterpart of #bigram_max: highest tf (or tf/df) single term.
      def unigram_max(records, tf, df = false)
        cluster_terms = records.map { |r| @tokens[r] }.flatten.uniq
        cluster_terms.max_by do |t|
          if t == @query || @query_terms.include?(t)
            0
          else
            # BUG FIX: the original referenced the undefined bare `labels`;
            # every sibling method uses the @labels ivar.
            i = @labels.index(t)
            if df
              df_i = df[i]
              df_i > 0 ? tf[i] / df_i : 0
            else
              tf[i]
            end
          end
        end
      end

      # Memoized bigram list for one record.
      def get_grams(r)
        @gram_cache[r] ||= gramize(@tokens[r])
      end

      # Builds the unique adjacent-pair bigrams from an ordered token list.
      def gramize(tokens)
        last_token = nil
        tokens.map do |token|
          # BUG FIX: `and` binds looser than `=`, so the original assigned
          # `new_token = last_token` and the inequality test was dead —
          # repeated tokens produced degenerate "x x" bigrams.
          new_token = last_token && last_token != token
          gram = new_token ? "#{last_token} #{token}" : nil
          last_token = token
          gram
        end.compact.uniq
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8

module Categorize
  module Models
    # Hierarchical variant of Cluster: builds a Ward-linkage dendrogram and
    # emits one labelled category set per level of the cluster tree.
    class HierarchicalCluster < Cluster
      def initialize
        super
        # Number of dendrogram levels retained by the hierarchical clusterer.
        @depth = 8
        @clusterer = Ai4r::Clusterers::WardLinkageHierarchical.new(@depth)
      end

      # Builds the tree for +query+ over +records_to_tokens+ and returns an
      # array of category sets, one per tree level. @num_clusters tracks the
      # level count as a side effect (as in the original implementation).
      def model(query, records_to_tokens)
        @query = query
        dataset = build_vars(records_to_tokens)
        @num_clusters = 1
        @clusterer.build(dataset, @num_clusters)
        @num_clusters = 0
        # IDIOM FIX: dropped the dead `cluster_sets = nil` pre-assignment that
        # was immediately overwritten; return the map result directly.
        @clusterer.cluster_tree.map do |clusters|
          @num_clusters += 1
          build_categories(clusters)
        end
      end
    end
  end
end
|
data/lib/categorize.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'categorize/models/abstract_model'
|
4
|
+
require 'categorize/models/bag_of_words'
|
5
|
+
require 'categorize/models/cluster'
|
6
|
+
require 'categorize/models/hierarchical_cluster'
|
7
|
+
|
8
|
+
require 'categorize/utils/gram_collection'
|
9
|
+
require 'categorize/utils/gram_node'
|
10
|
+
require 'categorize/utils/grams'
|
11
|
+
|
12
|
+
require 'categorize/constants'
|
13
|
+
require 'categorize/model'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,18 +10,21 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2013-04-14 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
|
-
description:
|
15
|
+
description: ! "A text categorization library that favors performance.\n Built
|
16
|
+
for use in online systems."
|
16
17
|
email: peter@helioid.com
|
17
18
|
executables: []
|
18
19
|
extensions: []
|
19
20
|
extra_rdoc_files: []
|
20
21
|
files:
|
22
|
+
- lib/categorize.rb
|
21
23
|
- lib/categorize/model.rb
|
22
24
|
- lib/categorize/constants.rb
|
23
25
|
- lib/categorize/models/bag_of_words.rb
|
24
|
-
- lib/categorize/
|
26
|
+
- lib/categorize/models/cluster.rb
|
27
|
+
- lib/categorize/models/hierarchical_cluster.rb
|
25
28
|
homepage: http://www.helioid.com/
|
26
29
|
licenses: []
|
27
30
|
post_install_message:
|
@@ -45,5 +48,5 @@ rubyforge_project:
|
|
45
48
|
rubygems_version: 1.8.24
|
46
49
|
signing_key:
|
47
50
|
specification_version: 3
|
48
|
-
summary:
|
51
|
+
summary: A text categorization library.
|
49
52
|
test_files: []
|
@@ -1,46 +0,0 @@
|
|
1
|
-
# encoding: utf-8

module Categorize
  module Utils
    # Helpers for building and pruning n-gram collections from records.
    module Grams
      # Builds one GramCollection per record in +records_to_words+, filtering
      # out grams that duplicate +query+ (or its word-rotated form).
      # Returns [gram_collections, unique_grams].
      def create_grams(query, records_to_words)
        all_grams = []
        @query = query
        @query_terms = query.split.map(&:downcase).map(&:strip)
        # BUG FIX: the original interpolated the Array @query_terms[1..-1]
        # directly, yielding e.g. '["b", "c"] a' (Array#to_s), so the
        # rotated-query filter could never match; join the words instead.
        @query_alt = "#{@query_terms[1..-1].join(' ')} #{@query_terms[0]}"

        invalid = Proc.new do |gram, *args|
          # remove [[gram]] if == [[query]]
          gram == @query || gram == @query_alt || @query_terms.include?(gram)
        end

        gram_collections = records_to_words.map do |record, words|
          gram_collection = GramCollection.new(record, words, invalid)
          all_grams += gram_collection.grams
          gram_collection
        end
        return gram_collections, make_grams_unique(all_grams)
      end

      # if exists [[gram]] and [[gram]]s then remove [[gram]]s
      # Mutates and returns +frequent_grams+.
      def check_plurals(frequent_grams)
        frequent_grams_contents = frequent_grams.map(&:content)
        frequent_grams.delete_if do |gram|
          # IDIOM FIX: && instead of `and` in a boolean expression.
          gram.content[-1] == 's' &&
            frequent_grams_contents.include?(gram.content[0...-1])
        end
      end

      # Collapses grams with identical content into one, summing frequencies.
      def make_grams_unique(grams)
        grams.reduce({}) do |hash, gram|
          if hash[gram.content]
            hash[gram.content].frequency += gram.frequency
          else
            hash[gram.content] = gram
          end
          hash
        end.values
      end
    end
  end
end
|