RubyGems - classifier - Versions diffs - 1.3.4 → 1.4.0 - Mend

classifier 1.3.4 → 1.4.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (21)

checksums.yaml +5 -5
data/LICENSE +2 -2
data/lib/classifier/bayes.rb +132 -124
data/lib/classifier/extensions/string.rb +1 -1
data/lib/classifier/extensions/vector.rb +72 -78
data/lib/classifier/extensions/vector_serialize.rb +8 -10
data/lib/classifier/extensions/word_hash.rb +114 -120
data/lib/classifier/lsi/content_node.rb +39 -37
data/lib/classifier/lsi/summary.rb +24 -24
data/lib/classifier/lsi/word_list.rb +7 -8
data/lib/classifier/lsi.rb +174 -151
data/lib/classifier.rb +2 -1
data/test/test_helper.rb +3 -2
metadata +60 -27
data/Gemfile +0 -5
data/Gemfile.lock +0 -26
data/README.markdown +0 -97
data/Rakefile +0 -84
data/test/bayes/bayesian_test.rb +0 -33
data/test/extensions/word_hash_test.rb +0 -35
data/test/lsi/lsi_test.rb +0 -123

data/lib/classifier/lsi.rb CHANGED Viewed

@@ -3,54 +3,55 @@
 # License::   LGPL
 begin
-   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
-   require 'gsl' # requires http://rb-gsl.rubyforge.org/
-   require 'classifier/extensions/vector_serialize'
-   $GSL = true
+  # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+  raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
+  require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
+  require 'classifier/extensions/vector_serialize'
+  $GSL = true
 rescue LoadError
-	warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
-	require 'classifier/extensions/vector'
+  warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
+  $GSL = false
+  require 'classifier/extensions/vector'
 end
 require 'classifier/lsi/word_list'
 require 'classifier/lsi/content_node'
 require 'classifier/lsi/summary'
 module Classifier
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
     attr_reader :word_list
     attr_accessor :auto_rebuild
     # Create a fresh index.
     # If you want to call #build_index manually, use
     #      Classifier::LSI.new :auto_rebuild => false
     #
     def initialize(options = {})
       @auto_rebuild = true unless options[:auto_rebuild] == false
-      @word_list, @items = WordList.new, {}
-      @version, @built_at_version = 0, -1
+      @word_list = WordList.new
+      @items = {}
+      @version = 0
+      @built_at_version = -1
     end
     # Returns true if the index needs to be rebuilt.  The index needs
     # to be built after all informaton is added, but before you start
     # using it for search, classification and cluster detection.
     def needs_rebuild?
       (@items.keys.size > 1) && (@version != @built_at_version)
     end
-    # Adds an item to the index. item is assumed to be a string, but
+    # Adds an item to the index. item is assumed to be a string, but
     # any item may be indexed so long as it responds to #to_s or if
-    # you provide an optional block explaining how the indexer can
+    # you provide an optional block explaining how the indexer can
     # fetch fresh string data. This optional block is passed the item,
     # so the item may only be a reference to a URL or file name.
-    #
+    #
     # For example:
     #   lsi = Classifier::LSI.new
     #   lsi.add_item "This is just plain text"
@@ -58,226 +59,252 @@ module Classifier
     #   ar = ActiveRecordObject.find( :all )
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
-    def add_item( item, *categories, &block )
+    def add_item(item, *categories, &block)
       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
       @items[item] = ContentNode.new(clean_word_hash, *categories)
       @version += 1
       build_index if @auto_rebuild
     end
-    # A less flexible shorthand for add_item that assumes
+    # A less flexible shorthand for add_item that assumes
     # you are passing in a string with no categorries. item
-    # will be duck typed via to_s .
+    # will be duck typed via to_s .
     #
-    def <<( item )
-      add_item item
+    def <<(item)
+      add_item(item)
     end
     # Returns the categories for a given indexed items. You are free to add and remove
     # items from this as you see fit. It does not invalide an index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
-      return @items[item].categories
+      @items[item].categories
     end
-    # Removes an item from the database, if it is indexed.
+    # Removes an item from the database, if it is indexed.
     #
-    def remove_item( item )
-      if @items.keys.contain? item
-        @items.remove item
-        @version += 1
-      end
+    def remove_item(item)
+      return unless @items.key?(item)
+      @items.delete(item)
+      @version += 1
     end
-    # Returns an array of items that are indexed.
+    # Returns an array of items that are indexed.
     def items
       @items.keys
     end
-    # Returns the categories for a given indexed items. You are free to add and remove
-    # items from this as you see fit. It does not invalide an index to change its categories.
-    def categories_for(item)
-      return [] unless @items[item]
-      return @items[item].categories
-    end
     # This function rebuilds the index if needs_rebuild? returns true.
     # For very large document spaces, this indexing operation may take some
-    # time to complete, so it may be wise to place the operation in another
-    # thread.
+    # time to complete, so it may be wise to place the operation in another
+    # thread.
     #
     # As a rule, indexing will be fairly swift on modern machines until
-    # you have well over 500 documents indexed, or have an incredibly diverse
-    # vocabulary for your documents.
+    # you have well over 500 documents indexed, or have an incredibly diverse
+    # vocabulary for your documents.
     #
     # The optional parameter "cutoff" is a tuning parameter. When the index is
-    # built, a certain number of s-values are discarded from the system. The
+    # built, a certain number of s-values are discarded from the system. The
     # cutoff parameter tells the indexer how many of these values to keep.
     # A value of 1 for cutoff means that no semantic analysis will take place,
     # turning the LSI class into a simple vector search engine.
-    def build_index( cutoff=0.75 )
+    def build_index(cutoff = 0.75)
       return unless needs_rebuild?
       make_word_list
       doc_list = @items.values
-      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+      tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
       if $GSL
-         tdm = GSL::Matrix.alloc(*tda).trans
-         ntdm = build_reduced_matrix(tdm, cutoff)
-         ntdm.size[1].times do |col|
-           vec = GSL::Vector.alloc( ntdm.column(col) ).row
-           doc_list[col].lsi_vector = vec
-           doc_list[col].lsi_norm = vec.normalize
-         end
+        tdm = GSL::Matrix.alloc(*tda).trans
+        ntdm = build_reduced_matrix(tdm, cutoff)
+        ntdm.size[1].times do |col|
+          vec = GSL::Vector.alloc(ntdm.column(col)).row
+          doc_list[col].lsi_vector = vec
+          doc_list[col].lsi_norm = vec.normalize
+        end
       else
-         tdm = Matrix.rows(tda).trans
-         ntdm = build_reduced_matrix(tdm, cutoff)
-         ntdm.row_size.times do |col|
-           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
-           doc_list[col].lsi_norm = ntdm.column(col).normalize  if doc_list[col]
-         end
+        tdm = Matrix.rows(tda).trans
+        ntdm = build_reduced_matrix(tdm, cutoff)
+        ntdm.row_size.times do |col|
+          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+        end
       end
       @built_at_version = @version
     end
     # This method returns max_chunks entries, ordered by their average semantic rating.
     # Essentially, the average distance of each entry from all other entries is calculated,
     # the highest are returned.
     #
     # This can be used to build a summary service, or to provide more information about
     # your dataset's general content. For example, if you were to use categorize on the
-    # results of this data, you could gather information on what your dataset is generally
+    # results of this data, you could gather information on what your dataset is generally
     # about.
-    def highest_relative_content( max_chunks=10 )
-       return [] if needs_rebuild?
-       avg_density = Hash.new
-       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
-       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
+    def highest_relative_content(max_chunks = 10)
+      return [] if needs_rebuild?
+      avg_density = {}
+      @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x, y| x + y[1] } }
+      avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
     end
-    # This function is the primitive that find_related and classify
+    # This function is the primitive that find_related and classify
     # build upon. It returns an array of 2-element arrays. The first element
     # of this array is a document, and the second is its "score", defining
     # how "close" it is to other indexed items.
-    #
+    #
     # These values are somewhat arbitrary, having to do with the vector space
     # created by your content, so the magnitude is interpretable but not always
-    # meaningful between indexes.
+    # meaningful between indexes.
     #
     # The parameter doc is the content to compare. If that content is not
-    # indexed, you can pass an optional block to define how to create the
-    # text data. See add_item for examples of how this works.
-    def proximity_array_for_content( doc, &block )
+    # indexed, you can pass an optional block to define how to create the
+    # text data. See add_item for examples of how this works.
+    def proximity_array_for_content(doc, &block)
       return [] if needs_rebuild?
-      content_node = node_for_content( doc, &block )
-      result =
+      content_node = node_for_content(doc, &block)
+      result =
         @items.keys.collect do |item|
-          if $GSL
-             val = content_node.search_vector * @items[item].search_vector.col
-          else
-             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
-          end
+          val = if $GSL
+                  content_node.search_vector * @items[item].search_vector.col
+                else
+                  (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+                end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
-    end
+    end
     # Similar to proximity_array_for_content, this function takes similar
     # arguments and returns a similar array. However, it uses the normalized
-    # calculated vectors instead of their full versions. This is useful when
+    # calculated vectors instead of their full versions. This is useful when
     # you're trying to perform operations on content that is much smaller than
     # the text you're working with. search uses this primitive.
-    def proximity_norms_for_content( doc, &block )
+    def proximity_norms_for_content(doc, &block)
       return [] if needs_rebuild?
-      content_node = node_for_content( doc, &block )
-      result =
+      content_node = node_for_content(doc, &block)
+      result =
         @items.keys.collect do |item|
-          if $GSL
-            val = content_node.search_norm * @items[item].search_norm.col
-          else
-            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
-          end
+          val = if $GSL
+                  content_node.search_norm * @items[item].search_norm.col
+                else
+                  (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+                end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
-    end
+    end
     # This function allows for text-based search of your index. Unlike other functions
     # like find_related and classify, search only takes short strings. It will also ignore
-    # factors like repeated words. It is best for short, google-like search terms.
-    # A search will first priortize lexical relationships, then semantic ones.
+    # factors like repeated words. It is best for short, google-like search terms.
+    # A search will first priortize lexical relationships, then semantic ones.
     #
     # While this may seem backwards compared to the other functions that LSI supports,
     # it is actually the same algorithm, just applied on a smaller document.
-    def search( string, max_nearest=3 )
+    def search(string, max_nearest = 3)
       return [] if needs_rebuild?
-      carry = proximity_norms_for_content( string )
+      carry = proximity_norms_for_content(string)
       result = carry.collect { |x| x[0] }
-      return result[0..max_nearest-1]
+      result[0..max_nearest - 1]
     end
     # This function takes content and finds other documents
     # that are semantically "close", returning an array of documents sorted
     # from most to least relavant.
-    # max_nearest specifies the number of documents to return. A value of
-    # 0 means that it returns all the indexed documents, sorted by relavence.
+    # max_nearest specifies the number of documents to return. A value of
+    # 0 means that it returns all the indexed documents, sorted by relavence.
     #
-    # This is particularly useful for identifing clusters in your document space.
+    # This is particularly useful for identifing clusters in your document space.
     # For example you may want to identify several "What's Related" items for weblog
     # articles, or find paragraphs that relate to each other in an essay.
-    def find_related( doc, max_nearest=3, &block )
-      carry =
-        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+    def find_related(doc, max_nearest = 3, &block)
+      carry =
+        proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
       result = carry.collect { |x| x[0] }
-      return result[0..max_nearest-1]
+      result[0..max_nearest - 1]
     end
-    # This function uses a voting system to categorize documents, based on
-    # the categories of other documents. It uses the same logic as the
+    # This function uses a voting system to categorize documents, based on
+    # the categories of other documents. It uses the same logic as the
     # find_related function to find related documents, then returns the
-    # most obvious category from this list.
+    # most obvious category from this list.
     #
-    # cutoff signifies the number of documents to consider when clasifying
-    # text. A cutoff of 1 means that every document in the index votes on
+    # cutoff signifies the number of documents to consider when clasifying
+    # text. A cutoff of 1 means that every document in the index votes on
     # what category the document is in. This may not always make sense.
     #
-    def classify( doc, cutoff=0.30, &block )
+    def classify(doc, cutoff = 0.30, &block)
+      votes = vote(doc, cutoff, &block)
+      ranking = votes.keys.sort_by { |x| votes[x] }
+      ranking[-1]
+    end
+    def vote(doc, cutoff = 0.30, &block)
       icutoff = (@items.size * cutoff).round
-      carry = proximity_array_for_content( doc, &block )
-      carry = carry[0..icutoff-1]
+      carry = proximity_array_for_content(doc, &block)
+      carry = carry[0..icutoff - 1]
       votes = {}
       carry.each do |pair|
         categories = @items[pair[0]].categories
-        categories.each do |category|
+        categories.each do |category|
           votes[category] ||= 0.0
-          votes[category] += pair[1]
+          votes[category] += pair[1]
         end
       end
+      votes
+    end
+    # Returns the same category as classify() but also returns
+    # a confidence value derived from the vote share that the
+    # winning category got.
+    #
+    # e.g.
+    # category,confidence = classify_with_confidence(doc)
+    # if confidence < 0.3
+    #   category = nil
+    # end
+    #
+    #
+    # See classify() for argument docs
+    def classify_with_confidence(doc, cutoff = 0.30, &block)
+      votes = vote(doc, cutoff, &block)
+      votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
+      return [nil, nil] if votes_sum.zero?
       ranking = votes.keys.sort_by { |x| votes[x] }
-      return ranking[-1]
+      winner = ranking[-1]
+      vote_share = votes[winner] / votes_sum.to_f
+      [winner, vote_share]
     end
     # Prototype, only works on indexed documents.
     # I have no clue if this is going to work, but in theory
     # it's supposed to.
-    def highest_ranked_stems( doc, count=3 )
-      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+    def highest_ranked_stems(doc, count = 3)
+      raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
       arr = node_for_content(doc).lsi_vector.to_a
-      top_n = arr.sort.reverse[0..count-1]
-      return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
+      top_n = arr.sort.reverse[0..count - 1]
+      top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
     end
     private
-    def build_reduced_matrix( matrix, cutoff=0.75 )
+    def build_reduced_matrix(matrix, cutoff = 0.75)
       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
       u, v, s = matrix.SV_decomp
@@ -287,32 +314,28 @@ module Classifier
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
-      u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
+      u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
     end
-    def node_for_content(item, &block)
-      if @items[item]
-        return @items[item]
-      else
-        clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
-        cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+    def node_for_content(item, &block)
+      return @items[item] if @items[item]
-        unless needs_rebuild?
-          cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
-        end
+      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+      unless needs_rebuild?
+        cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
       end
-      return cn
+      cn
     end
     def make_word_list
       @word_list = WordList.new
       @items.each_value do |node|
         node.word_hash.each_key { |key| @word_list.add_word key }
       end
     end
   end
 end

data/lib/classifier.rb CHANGED Viewed

@@ -26,5 +26,6 @@
 require 'rubygems'
 require 'classifier/extensions/string'
+require 'classifier/extensions/vector'
 require 'classifier/bayes'
-require 'classifier/lsi'
+require 'classifier/lsi'

data/test/test_helper.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 $:.unshift(File.dirname(__FILE__) + '/../lib')
-require 'test/unit'
-require 'classifier'
+require 'minitest'
+require 'minitest/autorun'
+require 'classifier'

metadata CHANGED Viewed

@@ -1,36 +1,80 @@
 --- !ruby/object:Gem::Specification
 name: classifier
 version: !ruby/object:Gem::Version
-  version: 1.3.4
+  version: 1.4.0
 platform: ruby
 authors:
 - Lucas Carlson
-autorequire: classifier
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-12-31 00:00:00.000000000 Z
+date: 2024-07-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 1.0.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 1.0.0
-description: |2
-     A general classifier module to allow Bayesian and other types of classifications.
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A general classifier module to allow Bayesian and other types of classifications.
 email: lucas@rufy.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- LICENSE
+- bin/bayes.rb
+- bin/summarize.rb
 - lib/classifier.rb
 - lib/classifier/bayes.rb
 - lib/classifier/extensions/string.rb
@@ -41,39 +85,28 @@ files:
 - lib/classifier/lsi/content_node.rb
 - lib/classifier/lsi/summary.rb
 - lib/classifier/lsi/word_list.rb
-- bin/bayes.rb
-- bin/summarize.rb
-- test/bayes/bayesian_test.rb
-- test/extensions/word_hash_test.rb
-- test/lsi/lsi_test.rb
 - test/test_helper.rb
-- Gemfile
-- Gemfile.lock
-- LICENSE
-- README.markdown
-- Rakefile
-homepage: http://classifier.rufy.com/
-licenses: []
+homepage: https://github.com/cardmagic/classifier
+licenses:
+- LGPL
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-requirements:
-- A porter-stemmer module to split word stems.
-rubyforge_project:
-rubygems_version: 2.0.3
-signing_key:
+requirements: []
+rubygems_version: 3.5.9
+signing_key:
 specification_version: 4
 summary: A general classifier module to allow Bayesian and other types of classifications.
 test_files: []

data/Gemfile DELETED Viewed

@@ -1,5 +0,0 @@
-source 'https://rubygems.org'
-gem 'rake'
-gem 'rspec', :require => 'spec'
-gem 'rdoc'
-gem 'fast-stemmer'

data/Gemfile.lock DELETED Viewed

@@ -1,26 +0,0 @@
-GEM
-  remote: https://rubygems.org/
-  specs:
-    diff-lcs (1.2.5)
-    fast-stemmer (1.0.2)
-    json (1.8.1)
-    rake (10.1.1)
-    rdoc (4.1.0)
-      json (~> 1.4)
-    rspec (2.14.1)
-      rspec-core (~> 2.14.0)
-      rspec-expectations (~> 2.14.0)
-      rspec-mocks (~> 2.14.0)
-    rspec-core (2.14.7)
-    rspec-expectations (2.14.4)
-      diff-lcs (>= 1.1.3, < 2.0)
-    rspec-mocks (2.14.4)
-PLATFORMS
-  ruby
-DEPENDENCIES
-  fast-stemmer
-  rake
-  rdoc
-  rspec