RubyGems - categorize - Versions diffs - 0.0.4 → 0.0.5 - Mend

categorize 0.0.4 → 0.0.5

Files changed (6) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    OGNkMmQ5MzEwZGFlOWUxNWM0MzU0MTI0MTI2NzE5NTBlNGZjYzM3Ng==
+  data.tar.gz: !binary |-
+    YmNmMDE5NWMxYmZhNWI0ZDI2NDA3MjdkOTNjYmI2MGUzMWY0ZTVjZQ==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    ZTdiM2IyMzRiOTg1Y2Y5MDc2ZWQwY2EyYjA3YTZjODEzYmM5MTU5NWVlNzBl
+    ZDdmYzhiNzdiOTYxOGY3YzgzNWFmZDhmMmIxODczZmY1NGM2MmM2NzI5NzVi
+    NWYzMGMwOGI2MWI5Mjk5NmY4MmMwM2YyZWFjNzU1MGMxMjcwYWI=
+  data.tar.gz: !binary |-
+    MjQ1NWQ4ZGVlMzNjZDZkNDVmODViOTY1ZTM4ZGZlYjhjMGVmNDQ4ZGRiNmRm
+    MGY1OTNhN2NkMzQ3Y2U4OGIyMDc3MTU2ZTc5MTE0ZGE4NTc4ODg2MGE5MjRm
+    N2M3MWQ4YzJhYzFjNTNjZTNjNDA3ZjVlM2RmZDVkMTcxNTFkNDM=

data/lib/categorize/models/abstract_model.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# encoding: utf-8
+module Categorize
+  module Models
+    class AbstractModel
+      require 'ai4r'
+      def initialize
+        @gram_cache = Hash.new(nil)
+        @bigram_max_cache = Hash.new(nil)
+      end
+      def build_vars(records_to_tokens)
+        @tokens = records_to_tokens.values
+        @labels, @vectors = vectorize(@tokens)
+        build_dataset(@labels, @vectors)
+      end
+      def vectorize(token_groups)
+        labels = token_groups.flatten.uniq
+        vectors = token_groups.reduce([]) do |ary, tokens|
+          items = Array.new(labels.length, 0)
+          labels.each_with_index do |token, i|
+            items[i] = tokens.count(token)
+          end
+          ary << items
+        end
+        [labels, vectors]
+      end
+      def build_dataset(labels, vectors)
+        Ai4r::Data::DataSet.new(data_items: vectors, data_labels: labels)
+      end
+    end
+  end
+end

data/lib/categorize/utils/gram_collection.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# encoding: utf-8
+module Categorize
+  module Utils
+    class GramCollection
+      attr_reader :grams, :content_to_frequency, :content
+      attr_accessor :fitness
+      def initialize(content, words, invalid)
+        @fitness = {}
+        @content = content
+        @invalid = invalid
+        # TODO: n grammify this
+        last_word = nil
+        last_2nd_word = nil
+        @grams = {}
+        @content_to_frequency = words.reduce({}) do |hash, word|
+          bigram = trigram = nil
+          if last_word && last_word != word
+            bigram = "#{last_word} #{word}"
+            if last_2nd_word && word != last_2nd_word
+              trigram = "#{last_2nd_word} #{bigram}"
+            end
+          end
+          [word, bigram, trigram].compact.each do |gram|
+            next if @invalid.call(gram)
+            if hash[gram]
+              hash[gram] += 1
+              @grams[gram].frequency += 1
+            else
+              hash[gram] = 1
+              @grams[gram] = GramNode.new(self, gram, 1)
+            end
+          end
+          last_2nd_word = last_word
+          last_word = word
+          hash
+        end
+        @grams = @grams.values
+      end
+    end
+  end
+end

data/lib/categorize/utils/gram_node.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# encoding: utf-8
+module Categorize
+  module Utils
+    class GramNode
+      attr_reader :content, :gram_collection
+      attr_accessor :frequency
+      def initialize(gram_collection, content, frequency = 0)
+        @gram_group = gram_collection
+        @content = content
+        @frequency = frequency
+      end
+    end
+  end
+end

data/lib/categorize/utils/grams.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# encoding: utf-8
+module Categorize
+  module Utils
+    module Grams
+      def create_grams(query, records_to_words)
+        all_grams = []
+        @query = query
+        @query_terms = query.split.map(&:downcase).map(&:strip)
+        @query_alt = "#{@query_terms[1..-1]} #{@query_terms[0]}"
+        invalid = Proc.new do |gram, *args|
+          # remove [[gram]] if == [[query]]
+          gram == @query || gram == @query_alt || @query_terms.include?(gram)
+        end
+        gram_collections = records_to_words.map do |record, words|
+          gram_collection = GramCollection.new(record, words, invalid)
+          all_grams += gram_collection.grams
+          gram_collection
+        end
+        return gram_collections, make_grams_unique(all_grams)
+      end
+      def check_plurals(frequent_grams)
+        # if exists [[gram]] and [[gram]]s then remove [[gram]]s
+        frequent_grams_contents = frequent_grams.map(&:content)
+        frequent_grams.delete_if do |gram|
+          gram.content[-1] == 's' and
+            frequent_grams_contents.include?(gram.content[0...-1])
+        end
+      end
+      def make_grams_unique(grams)
+        grams.reduce({}) do |hash, gram|
+          if hash[gram.content]
+            hash[gram.content].frequency += gram.frequency
+          else
+            hash[gram.content] = gram
+          end
+          hash
+        end.values
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,8 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: categorize
 version: !ruby/object:Gem::Version
-  version: 0.0.4
-  prerelease:
+  version: 0.0.5
 platform: ruby
 authors:
 - Peter Lubell-Doughtie
@@ -22,31 +21,34 @@ files:
 - lib/categorize.rb
 - lib/categorize/model.rb
 - lib/categorize/constants.rb
+- lib/categorize/models/abstract_model.rb
 - lib/categorize/models/bag_of_words.rb
 - lib/categorize/models/cluster.rb
 - lib/categorize/models/hierarchical_cluster.rb
+- lib/categorize/utils/gram_collection.rb
+- lib/categorize/utils/gram_node.rb
+- lib/categorize/utils/grams.rb
 homepage: http://www.helioid.com/
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 2.0.3
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: A text categorization library.
 test_files: []