classifier 1.3.4 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
 - data/LICENSE +2 -2
 - data/lib/classifier/bayes.rb +132 -124
 - data/lib/classifier/extensions/string.rb +1 -1
 - data/lib/classifier/extensions/vector.rb +72 -78
 - data/lib/classifier/extensions/vector_serialize.rb +8 -10
 - data/lib/classifier/extensions/word_hash.rb +114 -120
 - data/lib/classifier/lsi/content_node.rb +39 -37
 - data/lib/classifier/lsi/summary.rb +24 -24
 - data/lib/classifier/lsi/word_list.rb +7 -8
 - data/lib/classifier/lsi.rb +174 -151
 - data/lib/classifier.rb +2 -1
 - data/test/test_helper.rb +3 -2
 - metadata +60 -27
 - data/Gemfile +0 -5
 - data/Gemfile.lock +0 -26
 - data/README.markdown +0 -97
 - data/Rakefile +0 -84
 - data/test/bayes/bayesian_test.rb +0 -33
 - data/test/extensions/word_hash_test.rb +0 -35
 - data/test/lsi/lsi_test.rb +0 -123
 
    
        data/lib/classifier/lsi.rb
    CHANGED
    
    | 
         @@ -3,54 +3,55 @@ 
     | 
|
| 
       3 
3 
     | 
    
         
             
            # License::   LGPL
         
     | 
| 
       4 
4 
     | 
    
         | 
| 
       5 
5 
     | 
    
         
             
            begin
         
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
      
 6 
     | 
    
         
            +
              # to test the native vector class, try `rake test NATIVE_VECTOR=true`
         
     | 
| 
      
 7 
     | 
    
         
            +
              raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
              require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
         
     | 
| 
      
 10 
     | 
    
         
            +
              require 'classifier/extensions/vector_serialize'
         
     | 
| 
      
 11 
     | 
    
         
            +
              $GSL = true
         
     | 
| 
       12 
12 
     | 
    
         
             
            rescue LoadError
         
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
      
 13 
     | 
    
         
            +
              warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
         
     | 
| 
      
 14 
     | 
    
         
            +
              $GSL = false
         
     | 
| 
      
 15 
     | 
    
         
            +
              require 'classifier/extensions/vector'
         
     | 
| 
       15 
16 
     | 
    
         
             
            end
         
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
       17 
18 
     | 
    
         
             
            require 'classifier/lsi/word_list'
         
     | 
| 
       18 
19 
     | 
    
         
             
            require 'classifier/lsi/content_node'
         
     | 
| 
       19 
20 
     | 
    
         
             
            require 'classifier/lsi/summary'
         
     | 
| 
       20 
21 
     | 
    
         | 
| 
       21 
22 
     | 
    
         
             
            module Classifier
         
     | 
| 
       22 
     | 
    
         
            -
              
         
     | 
| 
       23 
23 
     | 
    
         
             
              # This class implements a Latent Semantic Indexer, which can search, classify and cluster
         
     | 
| 
       24 
24 
     | 
    
         
             
              # data based on underlying semantic relations. For more information on the algorithms used,
         
     | 
| 
       25 
25 
     | 
    
         
             
              # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
         
     | 
| 
       26 
26 
     | 
    
         
             
              class LSI
         
     | 
| 
       27 
     | 
    
         
            -
                
         
     | 
| 
       28 
27 
     | 
    
         
             
                attr_reader :word_list
         
     | 
| 
       29 
28 
     | 
    
         
             
                attr_accessor :auto_rebuild
         
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
       31 
30 
     | 
    
         
             
                # Create a fresh index.
         
     | 
| 
       32 
31 
     | 
    
         
             
                # If you want to call #build_index manually, use
         
     | 
| 
       33 
32 
     | 
    
         
             
                #      Classifier::LSI.new :auto_rebuild => false
         
     | 
| 
       34 
33 
     | 
    
         
             
                #
         
     | 
| 
       35 
34 
     | 
    
         
             
                def initialize(options = {})
         
     | 
| 
       36 
35 
     | 
    
         
             
                  @auto_rebuild = true unless options[:auto_rebuild] == false
         
     | 
| 
       37 
     | 
    
         
            -
                  @word_list 
     | 
| 
       38 
     | 
    
         
            -
                  @ 
     | 
| 
      
 36 
     | 
    
         
            +
                  @word_list = WordList.new
         
     | 
| 
      
 37 
     | 
    
         
            +
                  @items = {}
         
     | 
| 
      
 38 
     | 
    
         
            +
                  @version = 0
         
     | 
| 
      
 39 
     | 
    
         
            +
                  @built_at_version = -1
         
     | 
| 
       39 
40 
     | 
    
         
             
                end
         
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
       41 
42 
     | 
    
         
             
                # Returns true if the index needs to be rebuilt.  The index needs
         
     | 
| 
       42 
43 
     | 
    
         
             
                # to be built after all information is added, but before you start
         
     | 
| 
       43 
44 
     | 
    
         
             
                # using it for search, classification and cluster detection.
         
     | 
| 
       44 
45 
     | 
    
         
             
                def needs_rebuild?
         
     | 
| 
       45 
46 
     | 
    
         
             
                  (@items.keys.size > 1) && (@version != @built_at_version)
         
     | 
| 
       46 
47 
     | 
    
         
             
                end
         
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
                # Adds an item to the index. item is assumed to be a string, but 
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
                # Adds an item to the index. item is assumed to be a string, but
         
     | 
| 
       49 
50 
     | 
    
         
             
                # any item may be indexed so long as it responds to #to_s or if
         
     | 
| 
       50 
     | 
    
         
            -
                # you provide an optional block explaining how the indexer can 
     | 
| 
      
 51 
     | 
    
         
            +
                # you provide an optional block explaining how the indexer can
         
     | 
| 
       51 
52 
     | 
    
         
             
                # fetch fresh string data. This optional block is passed the item,
         
     | 
| 
       52 
53 
     | 
    
         
             
                # so the item may only be a reference to a URL or file name.
         
     | 
| 
       53 
     | 
    
         
            -
                # 
     | 
| 
      
 54 
     | 
    
         
            +
                #
         
     | 
| 
       54 
55 
     | 
    
         
             
                # For example:
         
     | 
| 
       55 
56 
     | 
    
         
             
                #   lsi = Classifier::LSI.new
         
     | 
| 
       56 
57 
     | 
    
         
             
                #   lsi.add_item "This is just plain text"
         
     | 
| 
         @@ -58,226 +59,252 @@ module Classifier 
     | 
|
| 
       58 
59 
     | 
    
         
             
                #   ar = ActiveRecordObject.find( :all )
         
     | 
| 
       59 
60 
     | 
    
         
             
                #   lsi.add_item ar, *ar.categories { |x| ar.content }
         
     | 
| 
       60 
61 
     | 
    
         
             
                #
         
     | 
| 
       61 
     | 
    
         
            -
                def add_item( 
     | 
| 
      
 62 
     | 
    
         
            +
                def add_item(item, *categories, &block)
         
     | 
| 
       62 
63 
     | 
    
         
             
                  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
         
     | 
| 
       63 
64 
     | 
    
         
             
                  @items[item] = ContentNode.new(clean_word_hash, *categories)
         
     | 
| 
       64 
65 
     | 
    
         
             
                  @version += 1
         
     | 
| 
       65 
66 
     | 
    
         
             
                  build_index if @auto_rebuild
         
     | 
| 
       66 
67 
     | 
    
         
             
                end
         
     | 
| 
       67 
68 
     | 
    
         | 
| 
       68 
     | 
    
         
            -
                # A less flexible shorthand for add_item that assumes 
     | 
| 
      
 69 
     | 
    
         
            +
                # A less flexible shorthand for add_item that assumes
         
     | 
| 
       69 
70 
     | 
    
         
             
                # you are passing in a string with no categories. item
         
     | 
| 
       70 
     | 
    
         
            -
                # will be duck typed via to_s . 
     | 
| 
      
 71 
     | 
    
         
            +
                # will be duck typed via to_s .
         
     | 
| 
       71 
72 
     | 
    
         
             
                #
         
     | 
| 
       72 
     | 
    
         
            -
                def <<( 
     | 
| 
       73 
     | 
    
         
            -
                  add_item 
     | 
| 
      
 73 
     | 
    
         
            +
                def <<(item)
         
     | 
| 
      
 74 
     | 
    
         
            +
                  add_item(item)
         
     | 
| 
       74 
75 
     | 
    
         
             
                end
         
     | 
| 
       75 
     | 
    
         
            -
             
     | 
| 
      
 76 
     | 
    
         
            +
             
     | 
| 
       76 
77 
     | 
    
         
             
                # Returns the categories for a given indexed item. You are free to add and remove
         
     | 
| 
       77 
78 
     | 
    
         
             
                # items from this as you see fit. It does not invalidate an index to change its categories.
         
     | 
| 
       78 
79 
     | 
    
         
             
                def categories_for(item)
         
     | 
| 
       79 
80 
     | 
    
         
             
                  return [] unless @items[item]
         
     | 
| 
       80 
     | 
    
         
            -
             
     | 
| 
      
 81 
     | 
    
         
            +
             
     | 
| 
      
 82 
     | 
    
         
            +
                  @items[item].categories
         
     | 
| 
       81 
83 
     | 
    
         
             
                end
         
     | 
| 
       82 
84 
     | 
    
         | 
| 
       83 
     | 
    
         
            -
                # Removes an item from the database, if it is indexed. 
     | 
| 
      
 85 
     | 
    
         
            +
                # Removes an item from the database, if it is indexed.
         
     | 
| 
       84 
86 
     | 
    
         
             
                #
         
     | 
| 
       85 
     | 
    
         
            -
                def remove_item( 
     | 
| 
       86 
     | 
    
         
            -
                   
     | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
                   
     | 
| 
      
 87 
     | 
    
         
            +
                def remove_item(item)
         
     | 
| 
      
 88 
     | 
    
         
            +
                  return unless @items.key?(item)
         
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
      
 90 
     | 
    
         
            +
                  @items.delete(item)
         
     | 
| 
      
 91 
     | 
    
         
            +
                  @version += 1
         
     | 
| 
       90 
92 
     | 
    
         
             
                end
         
     | 
| 
       91 
     | 
    
         
            -
             
     | 
| 
       92 
     | 
    
         
            -
                # Returns an array of items that are indexed. 
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
                # Returns an array of items that are indexed.
         
     | 
| 
       93 
95 
     | 
    
         
             
                def items
         
     | 
| 
       94 
96 
     | 
    
         
             
                  @items.keys
         
     | 
| 
       95 
97 
     | 
    
         
             
                end
         
     | 
| 
       96 
     | 
    
         
            -
                
         
     | 
| 
       97 
     | 
    
         
            -
                # Returns the categories for a given indexed items. You are free to add and remove
         
     | 
| 
       98 
     | 
    
         
            -
                # items from this as you see fit. It does not invalide an index to change its categories.
         
     | 
| 
       99 
     | 
    
         
            -
                def categories_for(item)
         
     | 
| 
       100 
     | 
    
         
            -
                  return [] unless @items[item]
         
     | 
| 
       101 
     | 
    
         
            -
                  return @items[item].categories
         
     | 
| 
       102 
     | 
    
         
            -
                end
         
     | 
| 
       103 
98 
     | 
    
         | 
| 
       104 
99 
     | 
    
         
             
                # This function rebuilds the index if needs_rebuild? returns true.
         
     | 
| 
       105 
100 
     | 
    
         
             
                # For very large document spaces, this indexing operation may take some
         
     | 
| 
       106 
     | 
    
         
            -
                # time to complete, so it may be wise to place the operation in another 
     | 
| 
       107 
     | 
    
         
            -
                # thread. 
     | 
| 
      
 101 
     | 
    
         
            +
                # time to complete, so it may be wise to place the operation in another
         
     | 
| 
      
 102 
     | 
    
         
            +
                # thread.
         
     | 
| 
       108 
103 
     | 
    
         
             
                #
         
     | 
| 
       109 
104 
     | 
    
         
             
                # As a rule, indexing will be fairly swift on modern machines until
         
     | 
| 
       110 
     | 
    
         
            -
                # you have well over 500 documents indexed, or have an incredibly diverse 
     | 
| 
       111 
     | 
    
         
            -
                # vocabulary for your documents. 
     | 
| 
      
 105 
     | 
    
         
            +
                # you have well over 500 documents indexed, or have an incredibly diverse
         
     | 
| 
      
 106 
     | 
    
         
            +
                # vocabulary for your documents.
         
     | 
| 
       112 
107 
     | 
    
         
             
                #
         
     | 
| 
       113 
108 
     | 
    
         
             
                # The optional parameter "cutoff" is a tuning parameter. When the index is
         
     | 
| 
       114 
     | 
    
         
            -
                # built, a certain number of s-values are discarded from the system. The 
     | 
| 
      
 109 
     | 
    
         
            +
                # built, a certain number of s-values are discarded from the system. The
         
     | 
| 
       115 
110 
     | 
    
         
             
                # cutoff parameter tells the indexer how many of these values to keep.
         
     | 
| 
       116 
111 
     | 
    
         
             
                # A value of 1 for cutoff means that no semantic analysis will take place,
         
     | 
| 
       117 
112 
     | 
    
         
             
                # turning the LSI class into a simple vector search engine.
         
     | 
| 
       118 
     | 
    
         
            -
                def build_index(  
     | 
| 
      
 113 
     | 
    
         
            +
                def build_index(cutoff = 0.75)
         
     | 
| 
       119 
114 
     | 
    
         
             
                  return unless needs_rebuild?
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
       120 
116 
     | 
    
         
             
                  make_word_list
         
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
      
 117 
     | 
    
         
            +
             
     | 
| 
       122 
118 
     | 
    
         
             
                  doc_list = @items.values
         
     | 
| 
       123 
     | 
    
         
            -
                  tda = doc_list.collect { |node| node.raw_vector_with( 
     | 
| 
       124 
     | 
    
         
            -
             
     | 
| 
      
 119 
     | 
    
         
            +
                  tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
         
     | 
| 
      
 120 
     | 
    
         
            +
             
     | 
| 
       125 
121 
     | 
    
         
             
                  if $GSL
         
     | 
| 
       126 
     | 
    
         
            -
             
     | 
| 
       127 
     | 
    
         
            -
             
     | 
| 
       128 
     | 
    
         
            -
             
     | 
| 
       129 
     | 
    
         
            -
             
     | 
| 
       130 
     | 
    
         
            -
             
     | 
| 
       131 
     | 
    
         
            -
             
     | 
| 
       132 
     | 
    
         
            -
             
     | 
| 
       133 
     | 
    
         
            -
             
     | 
| 
      
 122 
     | 
    
         
            +
                    tdm = GSL::Matrix.alloc(*tda).trans
         
     | 
| 
      
 123 
     | 
    
         
            +
                    ntdm = build_reduced_matrix(tdm, cutoff)
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
                    ntdm.size[1].times do |col|
         
     | 
| 
      
 126 
     | 
    
         
            +
                      vec = GSL::Vector.alloc(ntdm.column(col)).row
         
     | 
| 
      
 127 
     | 
    
         
            +
                      doc_list[col].lsi_vector = vec
         
     | 
| 
      
 128 
     | 
    
         
            +
                      doc_list[col].lsi_norm = vec.normalize
         
     | 
| 
      
 129 
     | 
    
         
            +
                    end
         
     | 
| 
       134 
130 
     | 
    
         
             
                  else
         
     | 
| 
       135 
     | 
    
         
            -
             
     | 
| 
       136 
     | 
    
         
            -
             
     | 
| 
       137 
     | 
    
         
            -
             
     | 
| 
       138 
     | 
    
         
            -
             
     | 
| 
       139 
     | 
    
         
            -
             
     | 
| 
       140 
     | 
    
         
            -
             
     | 
| 
       141 
     | 
    
         
            -
             
     | 
| 
      
 131 
     | 
    
         
            +
                    tdm = Matrix.rows(tda).trans
         
     | 
| 
      
 132 
     | 
    
         
            +
                    ntdm = build_reduced_matrix(tdm, cutoff)
         
     | 
| 
      
 133 
     | 
    
         
            +
             
     | 
| 
      
 134 
     | 
    
         
            +
                    ntdm.row_size.times do |col|
         
     | 
| 
      
 135 
     | 
    
         
            +
                      doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
         
     | 
| 
      
 136 
     | 
    
         
            +
                      doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
         
     | 
| 
      
 137 
     | 
    
         
            +
                    end
         
     | 
| 
       142 
138 
     | 
    
         
             
                  end
         
     | 
| 
       143 
     | 
    
         
            -
             
     | 
| 
      
 139 
     | 
    
         
            +
             
     | 
| 
       144 
140 
     | 
    
         
             
                  @built_at_version = @version
         
     | 
| 
       145 
141 
     | 
    
         
             
                end
         
     | 
| 
       146 
     | 
    
         
            -
             
     | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
       147 
143 
     | 
    
         
             
                # This method returns max_chunks entries, ordered by their average semantic rating.
         
     | 
| 
       148 
144 
     | 
    
         
             
                # Essentially, the average distance of each entry from all other entries is calculated,
         
     | 
| 
       149 
145 
     | 
    
         
             
                # the highest are returned.
         
     | 
| 
       150 
146 
     | 
    
         
             
                #
         
     | 
| 
       151 
147 
     | 
    
         
             
                # This can be used to build a summary service, or to provide more information about
         
     | 
| 
       152 
148 
     | 
    
         
             
                # your dataset's general content. For example, if you were to use categorize on the
         
     | 
| 
       153 
     | 
    
         
            -
                # results of this data, you could gather information on what your dataset is generally 
     | 
| 
      
 149 
     | 
    
         
            +
                # results of this data, you could gather information on what your dataset is generally
         
     | 
| 
       154 
150 
     | 
    
         
             
                # about.
         
     | 
| 
       155 
     | 
    
         
            -
                def highest_relative_content(  
     | 
| 
       156 
     | 
    
         
            -
             
     | 
| 
       157 
     | 
    
         
            -
             
     | 
| 
       158 
     | 
    
         
            -
             
     | 
| 
       159 
     | 
    
         
            -
             
     | 
| 
       160 
     | 
    
         
            -
             
     | 
| 
       161 
     | 
    
         
            -
             
     | 
| 
      
 151 
     | 
    
         
            +
                def highest_relative_content(max_chunks = 10)
         
     | 
| 
      
 152 
     | 
    
         
            +
                  return [] if needs_rebuild?
         
     | 
| 
      
 153 
     | 
    
         
            +
             
     | 
| 
      
 154 
     | 
    
         
            +
                  avg_density = {}
         
     | 
| 
      
 155 
     | 
    
         
            +
                  @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x, y| x + y[1] } }
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
                  avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
         
     | 
| 
       162 
158 
     | 
    
         
             
                end
         
     | 
| 
       163 
159 
     | 
    
         | 
| 
       164 
     | 
    
         
            -
                # This function is the primitive that find_related and classify 
     | 
| 
      
 160 
     | 
    
         
            +
                # This function is the primitive that find_related and classify
         
     | 
| 
       165 
161 
     | 
    
         
             
                # build upon. It returns an array of 2-element arrays. The first element
         
     | 
| 
       166 
162 
     | 
    
         
             
                # of this array is a document, and the second is its "score", defining
         
     | 
| 
       167 
163 
     | 
    
         
             
                # how "close" it is to other indexed items.
         
     | 
| 
       168 
     | 
    
         
            -
                # 
     | 
| 
      
 164 
     | 
    
         
            +
                #
         
     | 
| 
       169 
165 
     | 
    
         
             
                # These values are somewhat arbitrary, having to do with the vector space
         
     | 
| 
       170 
166 
     | 
    
         
             
                # created by your content, so the magnitude is interpretable but not always
         
     | 
| 
       171 
     | 
    
         
            -
                # meaningful between indexes. 
     | 
| 
      
 167 
     | 
    
         
            +
                # meaningful between indexes.
         
     | 
| 
       172 
168 
     | 
    
         
             
                #
         
     | 
| 
       173 
169 
     | 
    
         
             
                # The parameter doc is the content to compare. If that content is not
         
     | 
| 
       174 
     | 
    
         
            -
                # indexed, you can pass an optional block to define how to create the 
     | 
| 
       175 
     | 
    
         
            -
                # text data. See add_item for examples of how this works. 
     | 
| 
       176 
     | 
    
         
            -
                def proximity_array_for_content( 
     | 
| 
      
 170 
     | 
    
         
            +
                # indexed, you can pass an optional block to define how to create the
         
     | 
| 
      
 171 
     | 
    
         
            +
                # text data. See add_item for examples of how this works.
         
     | 
| 
      
 172 
     | 
    
         
            +
                def proximity_array_for_content(doc, &block)
         
     | 
| 
       177 
173 
     | 
    
         
             
                  return [] if needs_rebuild?
         
     | 
| 
       178 
     | 
    
         
            -
             
     | 
| 
       179 
     | 
    
         
            -
                  content_node = node_for_content( 
     | 
| 
       180 
     | 
    
         
            -
                  result = 
     | 
| 
      
 174 
     | 
    
         
            +
             
     | 
| 
      
 175 
     | 
    
         
            +
                  content_node = node_for_content(doc, &block)
         
     | 
| 
      
 176 
     | 
    
         
            +
                  result =
         
     | 
| 
       181 
177 
     | 
    
         
             
                    @items.keys.collect do |item|
         
     | 
| 
       182 
     | 
    
         
            -
                      if $GSL
         
     | 
| 
       183 
     | 
    
         
            -
             
     | 
| 
       184 
     | 
    
         
            -
             
     | 
| 
       185 
     | 
    
         
            -
             
     | 
| 
       186 
     | 
    
         
            -
             
     | 
| 
      
 178 
     | 
    
         
            +
                      val = if $GSL
         
     | 
| 
      
 179 
     | 
    
         
            +
                              content_node.search_vector * @items[item].search_vector.col
         
     | 
| 
      
 180 
     | 
    
         
            +
                            else
         
     | 
| 
      
 181 
     | 
    
         
            +
                              (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
         
     | 
| 
      
 182 
     | 
    
         
            +
                            end
         
     | 
| 
       187 
183 
     | 
    
         
             
                      [item, val]
         
     | 
| 
       188 
184 
     | 
    
         
             
                    end
         
     | 
| 
       189 
185 
     | 
    
         
             
                  result.sort_by { |x| x[1] }.reverse
         
     | 
| 
       190 
     | 
    
         
            -
                end 
     | 
| 
       191 
     | 
    
         
            -
             
     | 
| 
      
 186 
     | 
    
         
            +
                end
         
     | 
| 
      
 187 
     | 
    
         
            +
             
     | 
| 
       192 
188 
     | 
    
         
             
                # Similar to proximity_array_for_content, this function takes similar
         
     | 
| 
       193 
189 
     | 
    
         
             
                # arguments and returns a similar array. However, it uses the normalized
         
     | 
| 
       194 
     | 
    
         
            -
                # calculated vectors instead of their full versions. This is useful when 
     | 
| 
      
 190 
     | 
    
         
            +
                # calculated vectors instead of their full versions. This is useful when
         
     | 
| 
       195 
191 
     | 
    
         
             
                # you're trying to perform operations on content that is much smaller than
         
     | 
| 
       196 
192 
     | 
    
         
             
                # the text you're working with. search uses this primitive.
         
     | 
| 
       197 
     | 
    
         
            -
                def proximity_norms_for_content( 
     | 
| 
      
 193 
     | 
    
         
            +
                def proximity_norms_for_content(doc, &block)
         
     | 
| 
       198 
194 
     | 
    
         
             
                  return [] if needs_rebuild?
         
     | 
| 
       199 
     | 
    
         
            -
             
     | 
| 
       200 
     | 
    
         
            -
                  content_node = node_for_content( 
     | 
| 
       201 
     | 
    
         
            -
                  result = 
     | 
| 
      
 195 
     | 
    
         
            +
             
     | 
| 
      
 196 
     | 
    
         
            +
                  content_node = node_for_content(doc, &block)
         
     | 
| 
      
 197 
     | 
    
         
            +
                  result =
         
     | 
| 
       202 
198 
     | 
    
         
             
                    @items.keys.collect do |item|
         
     | 
| 
       203 
     | 
    
         
            -
                      if $GSL
         
     | 
| 
       204 
     | 
    
         
            -
             
     | 
| 
       205 
     | 
    
         
            -
             
     | 
| 
       206 
     | 
    
         
            -
             
     | 
| 
       207 
     | 
    
         
            -
             
     | 
| 
      
 199 
     | 
    
         
            +
                      val = if $GSL
         
     | 
| 
      
 200 
     | 
    
         
            +
                              content_node.search_norm * @items[item].search_norm.col
         
     | 
| 
      
 201 
     | 
    
         
            +
                            else
         
     | 
| 
      
 202 
     | 
    
         
            +
                              (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
         
     | 
| 
      
 203 
     | 
    
         
            +
                            end
         
     | 
| 
       208 
204 
     | 
    
         
             
                      [item, val]
         
     | 
| 
       209 
205 
     | 
    
         
             
                    end
         
     | 
| 
       210 
206 
     | 
    
         
             
                  result.sort_by { |x| x[1] }.reverse
         
     | 
| 
       211 
     | 
    
         
            -
                end 
     | 
| 
       212 
     | 
    
         
            -
             
     | 
| 
      
 207 
     | 
    
         
            +
                end
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
       213 
209 
     | 
    
         
             
                # This function allows for text-based search of your index. Unlike other functions
         
     | 
| 
       214 
210 
     | 
    
         
             
                # like find_related and classify, search only takes short strings. It will also ignore
         
     | 
| 
       215 
     | 
    
         
            -
                # factors like repeated words. It is best for short, google-like search terms. 
     | 
| 
       216 
     | 
    
         
            -
                # A search will first priortize lexical relationships, then semantic ones. 
     | 
| 
      
 211 
     | 
    
         
            +
                # factors like repeated words. It is best for short, google-like search terms.
         
     | 
| 
      
 212 
     | 
    
         
            +
                # A search will first priortize lexical relationships, then semantic ones.
         
     | 
| 
       217 
213 
     | 
    
         
             
                #
         
     | 
| 
       218 
214 
     | 
    
         
             
                # While this may seem backwards compared to the other functions that LSI supports,
         
     | 
| 
       219 
215 
     | 
    
         
             
                # it is actually the same algorithm, just applied on a smaller document.
         
     | 
| 
       220 
     | 
    
         
            -
                def search( 
     | 
| 
      
 216 
     | 
    
         
            +
                def search(string, max_nearest = 3)
         
     | 
| 
       221 
217 
     | 
    
         
             
                  return [] if needs_rebuild?
         
     | 
| 
       222 
     | 
    
         
            -
             
     | 
| 
      
 218 
     | 
    
         
            +
             
     | 
| 
      
 219 
     | 
    
         
            +
                  carry = proximity_norms_for_content(string)
         
     | 
| 
       223 
220 
     | 
    
         
             
                  result = carry.collect { |x| x[0] }
         
     | 
| 
       224 
     | 
    
         
            -
                   
     | 
| 
      
 221 
     | 
    
         
            +
                  result[0..max_nearest - 1]
         
     | 
| 
       225 
222 
     | 
    
         
             
                end
         
     | 
| 
       226 
     | 
    
         
            -
             
     | 
| 
      
 223 
     | 
    
         
            +
             
     | 
| 
       227 
224 
     | 
    
         
             
                # This function takes content and finds other documents
         
     | 
| 
       228 
225 
     | 
    
         
             
                # that are semantically "close", returning an array of documents sorted
         
     | 
| 
       229 
226 
     | 
    
         
             
                # from most to least relavant.
         
     | 
| 
       230 
     | 
    
         
            -
                # max_nearest specifies the number of documents to return. A value of 
     | 
| 
       231 
     | 
    
         
            -
                # 0 means that it returns all the indexed documents, sorted by relavence. 
     | 
| 
      
 227 
     | 
    
         
            +
                # max_nearest specifies the number of documents to return. A value of
         
     | 
| 
      
 228 
     | 
    
         
            +
                # 0 means that it returns all the indexed documents, sorted by relavence.
         
     | 
| 
       232 
229 
     | 
    
         
             
                #
         
     | 
| 
       233 
     | 
    
         
            -
                # This is particularly useful for identifing clusters in your document space. 
     | 
| 
      
 230 
     | 
    
         
            +
                # This is particularly useful for identifing clusters in your document space.
         
     | 
| 
       234 
231 
     | 
    
         
             
                # For example you may want to identify several "What's Related" items for weblog
         
     | 
| 
       235 
232 
     | 
    
         
             
                # articles, or find paragraphs that relate to each other in an essay.
         
     | 
| 
       236 
     | 
    
         
            -
                def find_related( 
     | 
| 
       237 
     | 
    
         
            -
                  carry = 
     | 
| 
       238 
     | 
    
         
            -
                    proximity_array_for_content( 
     | 
| 
      
 233 
     | 
    
         
            +
                def find_related(doc, max_nearest = 3, &block)
         
     | 
| 
      
 234 
     | 
    
         
            +
                  carry =
         
     | 
| 
      
 235 
     | 
    
         
            +
                    proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
         
     | 
| 
       239 
236 
     | 
    
         
             
                  result = carry.collect { |x| x[0] }
         
     | 
| 
       240 
     | 
    
         
            -
                   
     | 
| 
      
 237 
     | 
    
         
            +
                  result[0..max_nearest - 1]
         
     | 
| 
       241 
238 
     | 
    
         
             
                end
         
     | 
| 
       242 
     | 
    
         
            -
             
     | 
| 
       243 
     | 
    
         
            -
                # This function uses a voting system to categorize documents, based on 
     | 
| 
       244 
     | 
    
         
            -
                # the categories of other documents. It uses the same logic as the 
     | 
| 
      
 239 
     | 
    
         
            +
             
     | 
| 
      
 240 
     | 
    
         
            +
                # This function uses a voting system to categorize documents, based on
         
     | 
| 
      
 241 
     | 
    
         
            +
                # the categories of other documents. It uses the same logic as the
         
     | 
| 
       245 
242 
     | 
    
         
             
                # find_related function to find related documents, then returns the
         
     | 
| 
       246 
     | 
    
         
            -
                # most obvious category from this list. 
     | 
| 
      
 243 
     | 
    
         
            +
                # most obvious category from this list.
         
     | 
| 
       247 
244 
     | 
    
         
             
                #
         
     | 
| 
       248 
     | 
    
         
            -
                # cutoff signifies the number of documents to consider when clasifying 
     | 
| 
       249 
     | 
    
         
            -
                # text. A cutoff of 1 means that every document in the index votes on 
     | 
| 
      
 245 
     | 
    
         
            +
                # cutoff signifies the number of documents to consider when clasifying
         
     | 
| 
      
 246 
     | 
    
         
            +
                # text. A cutoff of 1 means that every document in the index votes on
         
     | 
| 
       250 
247 
     | 
    
         
             
                # what category the document is in. This may not always make sense.
         
     | 
| 
       251 
248 
     | 
    
         
             
                #
         
     | 
| 
       252 
     | 
    
         
            -
                def classify( 
     | 
| 
      
 249 
     | 
    
         
            +
                def classify(doc, cutoff = 0.30, &block)
         
     | 
| 
      
 250 
     | 
    
         
            +
                  votes = vote(doc, cutoff, &block)
         
     | 
| 
      
 251 
     | 
    
         
            +
             
     | 
| 
      
 252 
     | 
    
         
            +
                  ranking = votes.keys.sort_by { |x| votes[x] }
         
     | 
| 
      
 253 
     | 
    
         
            +
                  ranking[-1]
         
     | 
| 
      
 254 
     | 
    
         
            +
                end
         
     | 
| 
      
 255 
     | 
    
         
            +
             
     | 
| 
      
 256 
     | 
    
         
            +
                def vote(doc, cutoff = 0.30, &block)
         
     | 
| 
       253 
257 
     | 
    
         
             
                  icutoff = (@items.size * cutoff).round
         
     | 
| 
       254 
     | 
    
         
            -
                  carry = proximity_array_for_content( 
     | 
| 
       255 
     | 
    
         
            -
                  carry = carry[0..icutoff-1]
         
     | 
| 
      
 258 
     | 
    
         
            +
                  carry = proximity_array_for_content(doc, &block)
         
     | 
| 
      
 259 
     | 
    
         
            +
                  carry = carry[0..icutoff - 1]
         
     | 
| 
       256 
260 
     | 
    
         
             
                  votes = {}
         
     | 
| 
       257 
261 
     | 
    
         
             
                  carry.each do |pair|
         
     | 
| 
       258 
262 
     | 
    
         
             
                    categories = @items[pair[0]].categories
         
     | 
| 
       259 
     | 
    
         
            -
                    categories.each do |category| 
     | 
| 
      
 263 
     | 
    
         
            +
                    categories.each do |category|
         
     | 
| 
       260 
264 
     | 
    
         
             
                      votes[category] ||= 0.0
         
     | 
| 
       261 
     | 
    
         
            -
                      votes[category] += pair[1] 
     | 
| 
      
 265 
     | 
    
         
            +
                      votes[category] += pair[1]
         
     | 
| 
       262 
266 
     | 
    
         
             
                    end
         
     | 
| 
       263 
267 
     | 
    
         
             
                  end
         
     | 
| 
       264 
     | 
    
         
            -
                  
         
     | 
| 
      
 268 
     | 
    
         
            +
                  votes
         
     | 
| 
      
 269 
     | 
    
         
            +
                end
         
     | 
| 
      
 270 
     | 
    
         
            +
             
     | 
| 
      
 271 
     | 
    
         
            +
                # Returns the same category as classify() but also returns
         
     | 
| 
      
 272 
     | 
    
         
            +
                # a confidence value derived from the vote share that the
         
     | 
| 
      
 273 
     | 
    
         
            +
                # winning category got.
         
     | 
| 
      
 274 
     | 
    
         
            +
                #
         
     | 
| 
      
 275 
     | 
    
         
            +
                # e.g.
         
     | 
| 
      
 276 
     | 
    
         
            +
                # category,confidence = classify_with_confidence(doc)
         
     | 
| 
      
 277 
     | 
    
         
            +
                # if confidence < 0.3
         
     | 
| 
      
 278 
     | 
    
         
            +
                #   category = nil
         
     | 
| 
      
 279 
     | 
    
         
            +
                # end
         
     | 
| 
      
 280 
     | 
    
         
            +
                #
         
     | 
| 
      
 281 
     | 
    
         
            +
                #
         
     | 
| 
      
 282 
     | 
    
         
            +
                # See classify() for argument docs
         
     | 
| 
      
 283 
     | 
    
         
            +
                def classify_with_confidence(doc, cutoff = 0.30, &block)
         
     | 
| 
      
 284 
     | 
    
         
            +
                  votes = vote(doc, cutoff, &block)
         
     | 
| 
      
 285 
     | 
    
         
            +
                  votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
         
     | 
| 
      
 286 
     | 
    
         
            +
                  return [nil, nil] if votes_sum.zero?
         
     | 
| 
      
 287 
     | 
    
         
            +
             
     | 
| 
       265 
288 
     | 
    
         
             
                  ranking = votes.keys.sort_by { |x| votes[x] }
         
     | 
| 
       266 
     | 
    
         
            -
                   
     | 
| 
      
 289 
     | 
    
         
            +
                  winner = ranking[-1]
         
     | 
| 
      
 290 
     | 
    
         
            +
                  vote_share = votes[winner] / votes_sum.to_f
         
     | 
| 
      
 291 
     | 
    
         
            +
                  [winner, vote_share]
         
     | 
| 
       267 
292 
     | 
    
         
             
                end
         
     | 
| 
       268 
     | 
    
         
            -
             
     | 
| 
      
 293 
     | 
    
         
            +
             
     | 
| 
       269 
294 
     | 
    
         
             
                # Prototype, only works on indexed documents.
         
     | 
| 
       270 
295 
     | 
    
         
             
                # I have no clue if this is going to work, but in theory
         
     | 
| 
       271 
296 
     | 
    
         
             
                # it's supposed to.
         
     | 
| 
       272 
     | 
    
         
            -
                def highest_ranked_stems( 
     | 
| 
       273 
     | 
    
         
            -
                  raise  
     | 
| 
      
 297 
     | 
    
         
            +
                def highest_ranked_stems(doc, count = 3)
         
     | 
| 
      
 298 
     | 
    
         
            +
                  raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
         
     | 
| 
      
 299 
     | 
    
         
            +
             
     | 
| 
       274 
300 
     | 
    
         
             
                  arr = node_for_content(doc).lsi_vector.to_a
         
     | 
| 
       275 
     | 
    
         
            -
                  top_n = arr.sort.reverse[0..count-1]
         
     | 
| 
       276 
     | 
    
         
            -
                   
     | 
| 
      
 301 
     | 
    
         
            +
                  top_n = arr.sort.reverse[0..count - 1]
         
     | 
| 
      
 302 
     | 
    
         
            +
                  top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
         
     | 
| 
       277 
303 
     | 
    
         
             
                end
         
     | 
| 
       278 
304 
     | 
    
         | 
| 
       279 
305 
     | 
    
         
             
                private
         
     | 
| 
       280 
     | 
    
         
            -
             
     | 
| 
      
 306 
     | 
    
         
            +
             
     | 
| 
      
 307 
     | 
    
         
            +
                def build_reduced_matrix(matrix, cutoff = 0.75)
         
     | 
| 
       281 
308 
     | 
    
         
             
                  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
         
     | 
| 
       282 
309 
     | 
    
         
             
                  u, v, s = matrix.SV_decomp
         
     | 
| 
       283 
310 
     | 
    
         | 
| 
         @@ -287,32 +314,28 @@ module Classifier 
     | 
|
| 
       287 
314 
     | 
    
         
             
                    s[ord] = 0.0 if s[ord] < s_cutoff
         
     | 
| 
       288 
315 
     | 
    
         
             
                  end
         
     | 
| 
       289 
316 
     | 
    
         
             
                  # Reconstruct the term document matrix, only with reduced rank
         
     | 
| 
       290 
     | 
    
         
            -
                  u * ($GSL ? GSL::Matrix : ::Matrix).diag( 
     | 
| 
      
 317 
     | 
    
         
            +
                  u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
         
     | 
| 
       291 
318 
     | 
    
         
             
                end
         
     | 
| 
       292 
     | 
    
         
            -
                
         
     | 
| 
       293 
     | 
    
         
            -
                def node_for_content(item, &block)    
         
     | 
| 
       294 
     | 
    
         
            -
                  if @items[item]
         
     | 
| 
       295 
     | 
    
         
            -
                    return @items[item]
         
     | 
| 
       296 
     | 
    
         
            -
                  else
         
     | 
| 
       297 
     | 
    
         
            -
                    clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
         
     | 
| 
       298 
319 
     | 
    
         | 
| 
       299 
     | 
    
         
            -
             
     | 
| 
      
 320 
     | 
    
         
            +
                def node_for_content(item, &block)
         
     | 
| 
      
 321 
     | 
    
         
            +
                  return @items[item] if @items[item]
         
     | 
| 
       300 
322 
     | 
    
         | 
| 
       301 
     | 
    
         
            -
             
     | 
| 
       302 
     | 
    
         
            -
             
     | 
| 
       303 
     | 
    
         
            -
             
     | 
| 
      
 323 
     | 
    
         
            +
                  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
         
     | 
| 
      
 324 
     | 
    
         
            +
             
     | 
| 
      
 325 
     | 
    
         
            +
                  cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
         
     | 
| 
      
 326 
     | 
    
         
            +
             
     | 
| 
      
 327 
     | 
    
         
            +
                  unless needs_rebuild?
         
     | 
| 
      
 328 
     | 
    
         
            +
                    cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
         
     | 
| 
       304 
329 
     | 
    
         
             
                  end
         
     | 
| 
       305 
     | 
    
         
            -
             
     | 
| 
       306 
     | 
    
         
            -
                   
     | 
| 
      
 330 
     | 
    
         
            +
             
     | 
| 
      
 331 
     | 
    
         
            +
                  cn
         
     | 
| 
       307 
332 
     | 
    
         
             
                end
         
     | 
| 
       308 
     | 
    
         
            -
             
     | 
| 
      
 333 
     | 
    
         
            +
             
     | 
| 
       309 
334 
     | 
    
         
             
                def make_word_list
         
     | 
| 
       310 
335 
     | 
    
         
             
                  @word_list = WordList.new
         
     | 
| 
       311 
336 
     | 
    
         
             
                  @items.each_value do |node|
         
     | 
| 
       312 
337 
     | 
    
         
             
                    node.word_hash.each_key { |key| @word_list.add_word key }
         
     | 
| 
       313 
338 
     | 
    
         
             
                  end
         
     | 
| 
       314 
339 
     | 
    
         
             
                end
         
     | 
| 
       315 
     | 
    
         
            -
             
     | 
| 
       316 
340 
     | 
    
         
             
              end
         
     | 
| 
       317 
341 
     | 
    
         
             
            end
         
     | 
| 
       318 
     | 
    
         
            -
             
     | 
    
        data/lib/classifier.rb
    CHANGED
    
    
    
        data/test/test_helper.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,36 +1,80 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: classifier
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 1.4.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Lucas Carlson
         
     | 
| 
       8 
     | 
    
         
            -
            autorequire: 
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2024-07-31 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: fast-stemmer
         
     | 
| 
       15 
15 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       16 
16 
     | 
    
         
             
                requirements:
         
     | 
| 
       17 
     | 
    
         
            -
                - -  
     | 
| 
      
 17 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
       18 
18 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       19 
19 
     | 
    
         
             
                    version: 1.0.0
         
     | 
| 
       20 
20 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       21 
21 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       22 
22 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       23 
23 
     | 
    
         
             
                requirements:
         
     | 
| 
       24 
     | 
    
         
            -
                - -  
     | 
| 
      
 24 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
       25 
25 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       26 
26 
     | 
    
         
             
                    version: 1.0.0
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
      
 27 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 28 
     | 
    
         
            +
              name: rake
         
     | 
| 
      
 29 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 30 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 31 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 32 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 33 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 34 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 35 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 36 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 37 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 38 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 39 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 40 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 41 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 42 
     | 
    
         
            +
              name: minitest
         
     | 
| 
      
 43 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 44 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 45 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 46 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 47 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 48 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 49 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 50 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 51 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 52 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 53 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 54 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 55 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 56 
     | 
    
         
            +
              name: rdoc
         
     | 
| 
      
 57 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 58 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 59 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 60 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 61 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 62 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 63 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 64 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 65 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 66 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 67 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 68 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 69 
     | 
    
         
            +
            description: A general classifier module to allow Bayesian and other types of classifications.
         
     | 
| 
       29 
70 
     | 
    
         
             
            email: lucas@rufy.com
         
     | 
| 
       30 
71 
     | 
    
         
             
            executables: []
         
     | 
| 
       31 
72 
     | 
    
         
             
            extensions: []
         
     | 
| 
       32 
73 
     | 
    
         
             
            extra_rdoc_files: []
         
     | 
| 
       33 
74 
     | 
    
         
             
            files:
         
     | 
| 
      
 75 
     | 
    
         
            +
            - LICENSE
         
     | 
| 
      
 76 
     | 
    
         
            +
            - bin/bayes.rb
         
     | 
| 
      
 77 
     | 
    
         
            +
            - bin/summarize.rb
         
     | 
| 
       34 
78 
     | 
    
         
             
            - lib/classifier.rb
         
     | 
| 
       35 
79 
     | 
    
         
             
            - lib/classifier/bayes.rb
         
     | 
| 
       36 
80 
     | 
    
         
             
            - lib/classifier/extensions/string.rb
         
     | 
| 
         @@ -41,39 +85,28 @@ files: 
     | 
|
| 
       41 
85 
     | 
    
         
             
            - lib/classifier/lsi/content_node.rb
         
     | 
| 
       42 
86 
     | 
    
         
             
            - lib/classifier/lsi/summary.rb
         
     | 
| 
       43 
87 
     | 
    
         
             
            - lib/classifier/lsi/word_list.rb
         
     | 
| 
       44 
     | 
    
         
            -
            - bin/bayes.rb
         
     | 
| 
       45 
     | 
    
         
            -
            - bin/summarize.rb
         
     | 
| 
       46 
     | 
    
         
            -
            - test/bayes/bayesian_test.rb
         
     | 
| 
       47 
     | 
    
         
            -
            - test/extensions/word_hash_test.rb
         
     | 
| 
       48 
     | 
    
         
            -
            - test/lsi/lsi_test.rb
         
     | 
| 
       49 
88 
     | 
    
         
             
            - test/test_helper.rb
         
     | 
| 
       50 
     | 
    
         
            -
             
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
            -  
     | 
| 
       53 
     | 
    
         
            -
            - README.markdown
         
     | 
| 
       54 
     | 
    
         
            -
            - Rakefile
         
     | 
| 
       55 
     | 
    
         
            -
            homepage: http://classifier.rufy.com/
         
     | 
| 
       56 
     | 
    
         
            -
            licenses: []
         
     | 
| 
      
 89 
     | 
    
         
            +
            homepage: https://github.com/cardmagic/classifier
         
     | 
| 
      
 90 
     | 
    
         
            +
            licenses:
         
     | 
| 
      
 91 
     | 
    
         
            +
            - LGPL
         
     | 
| 
       57 
92 
     | 
    
         
             
            metadata: {}
         
     | 
| 
       58 
     | 
    
         
            -
            post_install_message: 
     | 
| 
      
 93 
     | 
    
         
            +
            post_install_message:
         
     | 
| 
       59 
94 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
       60 
95 
     | 
    
         
             
            require_paths:
         
     | 
| 
       61 
96 
     | 
    
         
             
            - lib
         
     | 
| 
       62 
97 
     | 
    
         
             
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
       63 
98 
     | 
    
         
             
              requirements:
         
     | 
| 
       64 
     | 
    
         
            -
              - -  
     | 
| 
      
 99 
     | 
    
         
            +
              - - ">="
         
     | 
| 
       65 
100 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       66 
101 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       67 
102 
     | 
    
         
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
       68 
103 
     | 
    
         
             
              requirements:
         
     | 
| 
       69 
     | 
    
         
            -
              - -  
     | 
| 
      
 104 
     | 
    
         
            +
              - - ">="
         
     | 
| 
       70 
105 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       71 
106 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       72 
     | 
    
         
            -
            requirements:
         
     | 
| 
       73 
     | 
    
         
            -
             
     | 
| 
       74 
     | 
    
         
            -
             
     | 
| 
       75 
     | 
    
         
            -
            rubygems_version: 2.0.3
         
     | 
| 
       76 
     | 
    
         
            -
            signing_key: 
         
     | 
| 
      
 107 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 108 
     | 
    
         
            +
            rubygems_version: 3.5.9
         
     | 
| 
      
 109 
     | 
    
         
            +
            signing_key:
         
     | 
| 
       77 
110 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       78 
111 
     | 
    
         
             
            summary: A general classifier module to allow Bayesian and other types of classifications.
         
     | 
| 
       79 
112 
     | 
    
         
             
            test_files: []
         
     | 
    
        data/Gemfile
    DELETED
    
    
    
        data/Gemfile.lock
    DELETED
    
    | 
         @@ -1,26 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            GEM
         
     | 
| 
       2 
     | 
    
         
            -
              remote: https://rubygems.org/
         
     | 
| 
       3 
     | 
    
         
            -
              specs:
         
     | 
| 
       4 
     | 
    
         
            -
                diff-lcs (1.2.5)
         
     | 
| 
       5 
     | 
    
         
            -
                fast-stemmer (1.0.2)
         
     | 
| 
       6 
     | 
    
         
            -
                json (1.8.1)
         
     | 
| 
       7 
     | 
    
         
            -
                rake (10.1.1)
         
     | 
| 
       8 
     | 
    
         
            -
                rdoc (4.1.0)
         
     | 
| 
       9 
     | 
    
         
            -
                  json (~> 1.4)
         
     | 
| 
       10 
     | 
    
         
            -
                rspec (2.14.1)
         
     | 
| 
       11 
     | 
    
         
            -
                  rspec-core (~> 2.14.0)
         
     | 
| 
       12 
     | 
    
         
            -
                  rspec-expectations (~> 2.14.0)
         
     | 
| 
       13 
     | 
    
         
            -
                  rspec-mocks (~> 2.14.0)
         
     | 
| 
       14 
     | 
    
         
            -
                rspec-core (2.14.7)
         
     | 
| 
       15 
     | 
    
         
            -
                rspec-expectations (2.14.4)
         
     | 
| 
       16 
     | 
    
         
            -
                  diff-lcs (>= 1.1.3, < 2.0)
         
     | 
| 
       17 
     | 
    
         
            -
                rspec-mocks (2.14.4)
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
            PLATFORMS
         
     | 
| 
       20 
     | 
    
         
            -
              ruby
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
            DEPENDENCIES
         
     | 
| 
       23 
     | 
    
         
            -
              fast-stemmer
         
     | 
| 
       24 
     | 
    
         
            -
              rake
         
     | 
| 
       25 
     | 
    
         
            -
              rdoc
         
     | 
| 
       26 
     | 
    
         
            -
              rspec
         
     |