RubyGems - classifier-reborn - Versions diffs - 2.0.3 → 2.0.4 - Mend

classifier-reborn 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

checksums.yaml +4 -4
data/README.markdown +130 -14
data/data/stopwords/ca +126 -0
data/data/stopwords/cs +138 -0
data/data/stopwords/da +101 -0
data/data/stopwords/de +604 -0
data/data/stopwords/en +80 -0
data/data/stopwords/es +351 -0
data/data/stopwords/fi +747 -0
data/data/stopwords/fr +463 -0
data/data/stopwords/hu +35 -0
data/data/stopwords/it +430 -0
data/data/stopwords/nl +48 -0
data/data/stopwords/no +119 -0
data/data/stopwords/pl +93 -0
data/data/stopwords/pt +356 -0
data/data/stopwords/se +386 -0
data/data/stopwords/tr +114 -0
data/lib/classifier-reborn/bayes.rb +86 -16
data/lib/classifier-reborn/category_namer.rb +3 -1
data/lib/classifier-reborn/extensions/hasher.rb +25 -100
data/lib/classifier-reborn/extensions/vector.rb +0 -1
data/lib/classifier-reborn/lsi.rb +36 -25
data/lib/classifier-reborn/lsi/cached_content_node.rb +48 -0
data/lib/classifier-reborn/lsi/content_node.rb +27 -10
data/lib/classifier-reborn/lsi/summarizer.rb +2 -2
data/lib/classifier-reborn/version.rb +1 -1
metadata +37 -3

data/lib/classifier-reborn/bayes.rb CHANGED

@@ -6,15 +6,40 @@ require_relative 'category_namer'
 module ClassifierReborn
   class Bayes
+    CategoryNotFoundError = Class.new(StandardError)
     # The class can be created with one or more categories, each of which will be
     # initialized and given a training method. E.g.,
     #      b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
-    def initialize(*categories)
+    #
+    # Options available are:
+    #   language:         'en'   Used to select language specific stop words
+    #   auto_categorize:  false  When true, enables ability to dynamically declare a category
+    #   enable_threshold: false  When true, enables a threshold requirement for classification
+    #   threshold:        0.0    Default threshold, only used when enabled
+    def initialize(*args)
       @categories = Hash.new
-      categories.each { |category| @categories[CategoryNamer.prepare_name(category)] = Hash.new }
-      @total_words = 0
-      @category_counts = Hash.new(0)
-      @category_word_count = Hash.new
+      options = { language:         'en',
+                  auto_categorize:  false,
+                  enable_threshold: false,
+                  threshold:        0.0
+                }
+      args.flatten.each { |arg|
+        if arg.kind_of?(Hash)
+          options.merge!(arg)
+        else
+          add_category(arg)
+        end
+      }
+      @total_words         = 0
+      @category_counts     = Hash.new(0)
+      @category_word_count = Hash.new(0)
+      @language            = options[:language]
+      @auto_categorize     = options[:auto_categorize]
+      @enable_threshold    = options[:enable_threshold]
+      @threshold           = options[:threshold]
     end
     # Provides a general training method for all categories specified in Bayes#new
@@ -25,10 +50,18 @@ module ClassifierReborn
     #     b.train "The other", "The other text"
     def train(category, text)
       category = CategoryNamer.prepare_name(category)
-      @category_word_count[category] ||= 0
+      # Add the category dynamically or raise an error
+      if !@categories.has_key?(category)
+        if @auto_categorize
+          add_category(category)
+        else
+          raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
+        end
+      end
       @category_counts[category] += 1
-      Hasher.word_hash(text).each do |word, count|
-        @categories[category][word]     ||=     0
+      Hasher.word_hash(text, @language).each do |word, count|
         @categories[category][word]      +=     count
         @category_word_count[category]   += count
         @total_words += count
@@ -44,12 +77,10 @@ module ClassifierReborn
     #     b.untrain :this, "This text"
     def untrain(category, text)
       category = CategoryNamer.prepare_name(category)
-      @category_word_count[category] ||= 0
       @category_counts[category] -= 1
-      Hasher.word_hash(text).each do |word, count|
+      Hasher.word_hash(text, @language).each do |word, count|
         if @total_words >= 0
           orig = @categories[category][word] || 0
-          @categories[category][word]     ||=     0
           @categories[category][word]      -=     count
           if @categories[category][word] <= 0
             @categories[category].delete(word)
@@ -70,7 +101,7 @@ module ClassifierReborn
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
     def classifications(text)
       score = Hash.new
-      word_hash = Hasher.word_hash(text)
+      word_hash = Hasher.word_hash(text, @language)
       training_count = @category_counts.values.reduce(:+).to_f
       @categories.each do |category, category_words|
         score[category.to_s] = 0
@@ -87,11 +118,50 @@ module ClassifierReborn
     end
     # Returns the classification of the provided +text+, which is one of the
-    # categories given in the initializer. E.g.,
+    # categories given in the initializer along with the score. E.g.,
     #    b.classify "I hate bad words and you"
-    #    =>  'Uninteresting'
+    #    =>  ['Uninteresting', -4.852030263919617]
+    def classify_with_score(text)
+      (classifications(text).sort_by { |a| -a[1] })[0]
+    end
+    # Return the classification without the score
     def classify(text)
-      (classifications(text).sort_by { |a| -a[1] })[0][0]
+      result, score = classify_with_score(text)
+      if threshold_enabled?
+        result = nil if score < @threshold || score == Float::INFINITY
+      end
+      return result
+    end
+    # Retrieve the current threshold value
+    def threshold
+      @threshold
+    end
+    # Dynamically set the threshold value
+    def threshold=(a_float)
+      @threshold = a_float
+    end
+    # Dynamically enable threshold for classify results
+    def enable_threshold
+      @enable_threshold = true
+    end
+    # Dynamically disable threshold for classify results
+    def disable_threshold
+      @enable_threshold = false
+    end
+    # Is threshold processing enabled?
+    def threshold_enabled?
+      @enable_threshold
+    end
+    # Is threshold processing disabled?
+    def threshold_disabled?
+      !@enable_threshold
     end
     # Provides training and untraining methods for the categories specified in Bayes#new
@@ -130,7 +200,7 @@ module ClassifierReborn
     # more criteria than the trained selective categories. In short,
     # try to initialize your categories at initialization.
     def add_category(category)
-      @categories[CategoryNamer.prepare_name(category)] = Hash.new
+      @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
     end
     alias append_category add_category

data/lib/classifier-reborn/category_namer.rb CHANGED

@@ -9,7 +9,9 @@ module ClassifierReborn
   module CategoryNamer
     extend self
     def prepare_name(name)
-      name.to_s.gsub("_"," ").capitalize.intern
+      return name if name.is_a?(Symbol)
+      name.to_s.gsub("_"," ").capitalize.intern
     end
   end
 end

data/lib/classifier-reborn/extensions/hasher.rb CHANGED

@@ -1,39 +1,33 @@
+# encoding: utf-8
 # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
-require "set"
+require 'set'
 module ClassifierReborn
   module Hasher
-    extend self
+    STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
-    # Removes common punctuation symbols, returning a new string.
-    # E.g.,
-    #   "Hello (greeting's), with {braces} < >...?".without_punctuation
-    #   => "Hello  greetings   with  braces         "
-    def without_punctuation(str)
-      str .tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
-    end
+    extend self
     # Return a Hash of strings => ints. Each word in the string is stemmed,
     # interned, and indexes to its frequency in the document.
-    def word_hash(str)
-      word_hash   = clean_word_hash(str)
-      symbol_hash = word_hash_for_symbols(str.gsub(/[\w]/," ").split)
-      return clean_word_hash(str).merge(symbol_hash)
+    def word_hash(str, language = 'en')
+      cleaned_word_hash = clean_word_hash(str, language)
+      symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
+      return cleaned_word_hash.merge(symbol_hash)
     end
     # Return a word hash without extra punctuation or short symbols, just stemmed words
-    def clean_word_hash(str)
-      word_hash_for_words str.gsub(/[^\w\s]/,"").split
+    def clean_word_hash(str, language = 'en')
+      word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
     end
-    def word_hash_for_words(words)
+    def word_hash_for_words(words, language = 'en')
       d = Hash.new(0)
       words.each do |word|
-        word.downcase!
-        if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+        if word.length > 2 && !STOPWORDS[language].include?(word)
           d[word.stem.intern] += 1
         end
       end
@@ -48,87 +42,18 @@ module ClassifierReborn
       return d
     end
-    CORPUS_SKIP_WORDS = Set.new(%w[
-      a
-      again
-      all
-      along
-      are
-      also
-      an
-      and
-      as
-      at
-      but
-      by
-      came
-      can
-      cant
-      couldnt
-      did
-      didn
-      didnt
-      do
-      doesnt
-      dont
-      ever
-      first
-      from
-      have
-      her
-      here
-      him
-      how
-      i
-      if
-      in
-      into
-      is
-      isnt
-      it
-      itll
-      just
-      last
-      least
-      like
-      most
-      my
-      new
-      no
-      not
-      now
-      of
-      on
-      or
-      should
-      sinc
-      so
-      some
-      th
-      than
-      this
-      that
-      the
-      their
-      then
-      those
-      to
-      told
-      too
-      true
-      try
-      until
-      url
-      us
-      were
-      when
-      whether
-      while
-      with
-      within
-      yes
-      you
-      youll
-    ])
+    # Create a lazily-loaded hash of stopword data
+    STOPWORDS = Hash.new do |hash, language|
+      hash[language] = []
+      STOPWORDS_PATH.each do |path|
+        if File.exist?(File.join(path, language))
+          hash[language] = Set.new File.read(File.join(path, language.to_s)).split
+          break
+        end
+      end
+      hash[language]
+    end
   end
 end

data/lib/classifier-reborn/extensions/vector.rb CHANGED

@@ -21,7 +21,6 @@ class Matrix
     qrot    = q.dup
     v       = Matrix.identity(q.row_size)
-    azrot   = nil
     mzrot   = nil
     cnt     = 0
     s_old   = nil

data/lib/classifier-reborn/lsi.rb CHANGED

@@ -15,6 +15,7 @@ end
 require_relative 'lsi/word_list'
 require_relative 'lsi/content_node'
+require_relative 'lsi/cached_content_node'
 require_relative 'lsi/summarizer'
 module ClassifierReborn
@@ -24,24 +25,30 @@ module ClassifierReborn
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
-    attr_reader :word_list
+    attr_reader :word_list, :cache_node_vectors
     attr_accessor :auto_rebuild
     # Create a fresh index.
     # If you want to call #build_index manually, use
     #      ClassifierReborn::LSI.new :auto_rebuild => false
+    # If you want to use ContentNodes with cached vector transpositions, use
+    #      lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
     #
     def initialize(options = {})
-      @auto_rebuild = true unless options[:auto_rebuild] == false
+      @auto_rebuild = options[:auto_rebuild] != false
       @word_list, @items = WordList.new, {}
       @version, @built_at_version = 0, -1
+      @language = options[:language] || 'en'
+      if @cache_node_vectors = options[:cache_node_vectors]
+        extend CachedContentNode::InstanceMethods
+      end
     end
     # Returns true if the index needs to be rebuilt.  The index needs
     # to be built after all information is added, but before you start
     # using it for search, classification and cluster detection.
     def needs_rebuild?
-      (@items.keys.size > 1) && (@version != @built_at_version)
+      (@items.size > 1) && (@version != @built_at_version)
     end
     # Adds an item to the index. item is assumed to be a string, but
@@ -58,8 +65,12 @@ module ClassifierReborn
     #   lsi.add_item ar, *ar.categories { |x| ar.content }
     #
     def add_item( item, *categories, &block )
-      clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
-      @items[item] = ContentNode.new(clean_word_hash, *categories)
+      clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
+      @items[item] = if @cache_node_vectors
+        CachedContentNode.new(clean_word_hash, *categories)
+      else
+        ContentNode.new(clean_word_hash, *categories)
+      end
       @version += 1
       build_index if @auto_rebuild
     end
@@ -93,13 +104,6 @@ module ClassifierReborn
       @items.keys
     end
-    # Returns the categories for a given indexed items. You are free to add and remove
-    # items from this as you see fit. It does not invalidate an index to change its categories.
-    def categories_for(item)
-      return [] unless @items[item]
-      return @items[item].categories
-    end
     # This function rebuilds the index if needs_rebuild? returns true.
     # For very large document spaces, this indexing operation may take some
     # time to complete, so it may be wise to place the operation in another
@@ -155,7 +159,7 @@ module ClassifierReborn
        return [] if needs_rebuild?
        avg_density = Hash.new
-       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
+       @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x,y| x + y[1]} }
        avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
     end
@@ -179,9 +183,9 @@ module ClassifierReborn
       result =
         @items.keys.collect do |item|
           if $GSL
-             val = content_node.search_vector * @items[item].search_vector.col
+            val = content_node.search_vector * @items[item].transposed_search_vector
           else
-             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+            val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
           end
           [item, val]
         end
@@ -234,35 +238,42 @@ module ClassifierReborn
     # articles, or find paragraphs that relate to each other in an essay.
     def find_related( doc, max_nearest=3, &block )
       carry =
-        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+        proximity_array_for_content( doc, &block ).reject { |pair| pair[0].eql? doc }
       result = carry.collect { |x| x[0] }
       return result[0..max_nearest-1]
     end
+    # Return the most obvious category with the score
+    def classify_with_score( doc, cutoff=0.30, &block)
+      return scored_categories(doc, cutoff, &block).last
+    end
+    # Return the most obvious category without the score
+    def classify( doc, cutoff=0.30, &block )
+      return scored_categories(doc, cutoff, &block).last.first
+    end
     # This function uses a voting system to categorize documents, based on
     # the categories of other documents. It uses the same logic as the
     # find_related function to find related documents, then returns the
-    # most obvious category from this list.
+    # list of sorted categories.
     #
     # cutoff signifies the number of documents to consider when classifying
     # text. A cutoff of 1 means that every document in the index votes on
     # what category the document is in. This may not always make sense.
     #
-    def classify( doc, cutoff=0.30, &block )
+    def scored_categories( doc, cutoff=0.30, &block )
       icutoff = (@items.size * cutoff).round
       carry = proximity_array_for_content( doc, &block )
       carry = carry[0..icutoff-1]
-      votes = {}
+      votes = Hash.new(0.0)
       carry.each do |pair|
-        categories = @items[pair[0]].categories
-        categories.each do |category|
-          votes[category] ||= 0.0
+        @items[pair[0]].categories.each do |category|
           votes[category] += pair[1]
         end
       end
-      ranking = votes.keys.sort_by { |x| votes[x] }
-      return ranking[-1]
+      return votes.sort_by { |_, score| score }
     end
     # Prototype, only works on indexed documents.
@@ -293,7 +304,7 @@ module ClassifierReborn
       if @items[item]
         return @items[item]
       else
-        clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
+        clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data