categorize 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
 - data/lib/categorize/models/abstract_model.rb +36 -0
 - data/lib/categorize/utils/gram_collection.rb +47 -0
 - data/lib/categorize/utils/gram_node.rb +16 -0
 - data/lib/categorize/utils/grams.rb +46 -0
 - metadata +8 -6
 
    
        checksums.yaml
    ADDED
    
    | 
         @@ -0,0 +1,15 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ---
         
     | 
| 
      
 2 
     | 
    
         
            +
            !binary "U0hBMQ==":
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: !binary |-
         
     | 
| 
      
 4 
     | 
    
         
            +
                OGNkMmQ5MzEwZGFlOWUxNWM0MzU0MTI0MTI2NzE5NTBlNGZjYzM3Ng==
         
     | 
| 
      
 5 
     | 
    
         
            +
              data.tar.gz: !binary |-
         
     | 
| 
      
 6 
     | 
    
         
            +
                YmNmMDE5NWMxYmZhNWI0ZDI2NDA3MjdkOTNjYmI2MGUzMWY0ZTVjZQ==
         
     | 
| 
      
 7 
     | 
    
         
            +
            !binary "U0hBNTEy":
         
     | 
| 
      
 8 
     | 
    
         
            +
              metadata.gz: !binary |-
         
     | 
| 
      
 9 
     | 
    
         
            +
                ZTdiM2IyMzRiOTg1Y2Y5MDc2ZWQwY2EyYjA3YTZjODEzYmM5MTU5NWVlNzBl
         
     | 
| 
      
 10 
     | 
    
         
            +
                ZDdmYzhiNzdiOTYxOGY3YzgzNWFmZDhmMmIxODczZmY1NGM2MmM2NzI5NzVi
         
     | 
| 
      
 11 
     | 
    
         
            +
                NWYzMGMwOGI2MWI5Mjk5NmY4MmMwM2YyZWFjNzU1MGMxMjcwYWI=
         
     | 
| 
      
 12 
     | 
    
         
            +
              data.tar.gz: !binary |-
         
     | 
| 
      
 13 
     | 
    
         
            +
                MjQ1NWQ4ZGVlMzNjZDZkNDVmODViOTY1ZTM4ZGZlYjhjMGVmNDQ4ZGRiNmRm
         
     | 
| 
      
 14 
     | 
    
         
            +
                MGY1OTNhN2NkMzQ3Y2U4OGIyMDc3MTU2ZTc5MTE0ZGE4NTc4ODg2MGE5MjRm
         
     | 
| 
      
 15 
     | 
    
         
            +
                N2M3MWQ4YzJhYzFjNTNjZTNjNDA3ZjVlM2RmZDVkMTcxNTFkNDM=
         
     | 
| 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Categorize
         
     | 
| 
      
 4 
     | 
    
         
            +
              module Models
         
     | 
| 
      
 5 
     | 
    
         
            +
                class AbstractModel
         
     | 
| 
      
 6 
     | 
    
         
            +
                  require 'ai4r'
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
                  # Sets up two empty per-instance lookup tables, @gram_cache and
                  # @bigram_max_cache. Missing keys read back as nil.
                  def initialize
                    @gram_cache = {}
                    @bigram_max_cache = {}
                  end
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                  # Stores the token groups of +records_to_tokens+ (its values) in
                  # @tokens, vectorizes them into @labels and @vectors, and returns
                  # whatever build_dataset produces for that pair.
                  #
                  # records_to_tokens - object responding to #values; each value is
                  #                     one group of tokens.
                  def build_vars(records_to_tokens)
                    @tokens = records_to_tokens.values
                    labels, vectors = vectorize(@tokens)
                    @labels = labels
                    @vectors = vectors
                    build_dataset(labels, vectors)
                  end
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
                  # Turns groups of tokens into count vectors over a shared vocabulary.
                  #
                  # token_groups - Array of token Arrays (one per record).
                  #
                  # Returns a two-element Array [labels, vectors]:
                  #   labels  - every distinct token across all groups, in order of
                  #             first appearance.
                  #   vectors - one Array per group; element i is how many times
                  #             labels[i] occurs in that group.
                  def vectorize(token_groups)
                    labels = token_groups.flatten.uniq
                    vectors = token_groups.map do |tokens|
                      labels.map { |label| tokens.count(label) }
                    end
                    [labels, vectors]
                  end
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                  # Wraps the given labels and vectors in a new Ai4r::Data::DataSet,
                  # passing them as the :data_items (vectors) and :data_labels
                  # (labels) options, and returns that data set.
                  def build_dataset(labels, vectors)
                    Ai4r::Data::DataSet.new(
                      data_items: vectors,
                      data_labels: labels
                    )
                  end
         
     | 
| 
      
 34 
     | 
    
         
            +
                end
         
     | 
| 
      
 35 
     | 
    
         
            +
              end
         
     | 
| 
      
 36 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,47 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Categorize
         
     | 
| 
      
 4 
     | 
    
         
            +
              module Utils
         
     | 
| 
      
 5 
     | 
    
         
            +
                # Counts the unigrams, bigrams and trigrams found in one sequence of
                # words and exposes them both as a content => frequency Hash and as
                # a list of GramNode objects.
                class GramCollection
                  attr_reader :grams, :content_to_frequency, :content
                  attr_accessor :fitness

                  # content - stored as-is and exposed through #content.
                  # words   - enumerable of words, walked in order.
                  # invalid - object responding to #call(gram); any gram for which
                  #           it returns a truthy value is left out of the counts.
                  def initialize(content, words, invalid)
                    @fitness = {}
                    @content = content
                    @invalid = invalid

                    # TODO: n grammify this
                    nodes = {}
                    @content_to_frequency = {}
                    last_word = nil
                    last_2nd_word = nil

                    words.each do |word|
                      grams_ending_at(word, last_word, last_2nd_word).each do |gram|
                        next if @invalid.call(gram)
                        if @content_to_frequency[gram]
                          @content_to_frequency[gram] += 1
                          nodes[gram].frequency += 1
                        else
                          @content_to_frequency[gram] = 1
                          nodes[gram] = GramNode.new(self, gram, 1)
                        end
                      end
                      # Slide the two-word history window forward.
                      last_2nd_word = last_word
                      last_word = word
                    end

                    # Callers get the nodes as a list, in first-seen order.
                    @grams = nodes.values
                  end

                  private

                  # Returns the grams that end at +word+: the word itself, the
                  # bigram with the previous word (only when the two differ), and
                  # the trigram with the word before that (only when a bigram was
                  # formed and +word+ differs from that earlier word).
                  def grams_ending_at(word, last_word, last_2nd_word)
                    bigram = trigram = nil
                    if last_word && last_word != word
                      bigram = "#{last_word} #{word}"
                      if last_2nd_word && word != last_2nd_word
                        trigram = "#{last_2nd_word} #{bigram}"
                      end
                    end
                    [word, bigram, trigram].compact
                  end
                end
         
     | 
| 
      
 46 
     | 
    
         
            +
              end
         
     | 
| 
      
 47 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,16 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Categorize
         
     | 
| 
      
 4 
     | 
    
         
            +
              module Utils
         
     | 
| 
      
 5 
     | 
    
         
            +
                # One gram (its text) together with its frequency and the object
                # that was passed in as its owning gram collection.
                class GramNode
                  attr_reader :content, :gram_collection
                  attr_accessor :frequency

                  # gram_collection - owner object, exposed via #gram_collection.
                  # content         - the gram's text.
                  # frequency       - initial count (defaults to 0).
                  def initialize(gram_collection, content, frequency = 0)
                    # Must be stored under the name the attr_reader exposes;
                    # otherwise #gram_collection always returns nil.
                    @gram_collection = gram_collection
                    @content = content
                    @frequency = frequency
                  end
                end
         
     | 
| 
      
 15 
     | 
    
         
            +
              end
         
     | 
| 
      
 16 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,46 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Categorize
         
     | 
| 
      
 4 
     | 
    
         
            +
              module Utils
         
     | 
| 
      
 5 
     | 
    
         
            +
                # Helpers for building per-record gram collections for a query and
                # for merging / pruning the resulting grams.
                module Grams
                  # Builds one GramCollection per record and a merged list of grams.
                  #
                  # Grams equal to the query, to the query with its first term moved
                  # to the end, or to any single query term are filtered out.
                  #
                  # query            - String search query.
                  # records_to_words - enumerable of (record, words) pairs.
                  #
                  # Returns [gram_collections, unique_grams].
                  def create_grams(query, records_to_words)
                    @query = query
                    @query_terms = query.split.map { |term| term.downcase.strip }
                    # Join the rotated terms explicitly: interpolating the Array
                    # slice would embed its inspect form (e.g. ["b", "c"] a).
                    @query_alt = @query_terms.rotate.join(' ')

                    invalid = Proc.new do |gram, *_args|
                      # remove [[gram]] if == [[query]]
                      gram == @query || gram == @query_alt || @query_terms.include?(gram)
                    end

                    gram_collections = records_to_words.map do |record, words|
                      GramCollection.new(record, words, invalid)
                    end
                    all_grams = gram_collections.flat_map(&:grams)

                    [gram_collections, make_grams_unique(all_grams)]
                  end

                  # Removes, in place, every gram whose content ends in 's' when the
                  # same content without that final 's' is also present.
                  #
                  # Returns the (mutated) frequent_grams collection.
                  def check_plurals(frequent_grams)
                    # if exists [[gram]] and [[gram]]s then remove [[gram]]s
                    # Snapshot the contents first so deletions do not affect lookups.
                    frequent_grams_contents = frequent_grams.map(&:content)
                    frequent_grams.delete_if do |gram|
                      gram.content.end_with?('s') &&
                        frequent_grams_contents.include?(gram.content[0...-1])
                    end
                  end

                  # Collapses grams sharing the same content into the first one seen,
                  # adding later frequencies onto it (the kept gram is mutated).
                  #
                  # Returns the kept grams in first-seen order.
                  def make_grams_unique(grams)
                    merged = grams.each_with_object({}) do |gram, by_content|
                      if by_content[gram.content]
                        by_content[gram.content].frequency += gram.frequency
                      else
                        by_content[gram.content] = gram
                      end
                    end
                    merged.values
                  end
                end
         
     | 
| 
      
 45 
     | 
    
         
            +
              end
         
     | 
| 
      
 46 
     | 
    
         
            +
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,8 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: categorize
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.4
     | 
| 
       5 
     | 
    
         
            -
              prerelease: 
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.5
         
     | 
| 
       6 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       8 
7 
     | 
    
         
             
            - Peter Lubell-Doughtie
         
     | 
| 
         @@ -22,31 +21,34 @@ files: 
     | 
|
| 
       22 
21 
     | 
    
         
             
            - lib/categorize.rb
         
     | 
| 
       23 
22 
     | 
    
         
             
            - lib/categorize/model.rb
         
     | 
| 
       24 
23 
     | 
    
         
             
            - lib/categorize/constants.rb
         
     | 
| 
      
 24 
     | 
    
         
            +
            - lib/categorize/models/abstract_model.rb
         
     | 
| 
       25 
25 
     | 
    
         
             
            - lib/categorize/models/bag_of_words.rb
         
     | 
| 
       26 
26 
     | 
    
         
             
            - lib/categorize/models/cluster.rb
         
     | 
| 
       27 
27 
     | 
    
         
             
            - lib/categorize/models/hierarchical_cluster.rb
         
     | 
| 
      
 28 
     | 
    
         
            +
            - lib/categorize/utils/gram_collection.rb
         
     | 
| 
      
 29 
     | 
    
         
            +
            - lib/categorize/utils/gram_node.rb
         
     | 
| 
      
 30 
     | 
    
         
            +
            - lib/categorize/utils/grams.rb
         
     | 
| 
       28 
31 
     | 
    
         
             
            homepage: http://www.helioid.com/
         
     | 
| 
       29 
32 
     | 
    
         
             
            licenses: []
         
     | 
| 
      
 33 
     | 
    
         
            +
            metadata: {}
         
     | 
| 
       30 
34 
     | 
    
         
             
            post_install_message: 
         
     | 
| 
       31 
35 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
       32 
36 
     | 
    
         
             
            require_paths:
         
     | 
| 
       33 
37 
     | 
    
         
             
            - lib
         
     | 
| 
       34 
38 
     | 
    
         
             
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
       35 
     | 
    
         
            -
              none: false
         
     | 
| 
       36 
39 
     | 
    
         
             
              requirements:
         
     | 
| 
       37 
40 
     | 
    
         
             
              - - ! '>='
         
     | 
| 
       38 
41 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       39 
42 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       40 
43 
     | 
    
         
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
       41 
     | 
    
         
            -
              none: false
         
     | 
| 
       42 
44 
     | 
    
         
             
              requirements:
         
     | 
| 
       43 
45 
     | 
    
         
             
              - - ! '>='
         
     | 
| 
       44 
46 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       45 
47 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       46 
48 
     | 
    
         
             
            requirements: []
         
     | 
| 
       47 
49 
     | 
    
         
             
            rubyforge_project: 
         
     | 
| 
       48 
     | 
    
         
            -
            rubygems_version:  
     | 
| 
      
 50 
     | 
    
         
            +
            rubygems_version: 2.0.3
         
     | 
| 
       49 
51 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       50 
     | 
    
         
            -
            specification_version:  
     | 
| 
      
 52 
     | 
    
         
            +
            specification_version: 4
         
     | 
| 
       51 
53 
     | 
    
         
             
            summary: A text categorization library.
         
     | 
| 
       52 
54 
     | 
    
         
             
            test_files: []
         
     |