RubyGems - classifier-reborn - Versions diffs - 2.0.4 → 2.3.0 - Mend

classifier-reborn 2.0.4 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

checksums.yaml +5 -5
data/LICENSE +74 -1
data/README.markdown +57 -207
data/data/stopwords/ar +104 -0
data/data/stopwords/bn +362 -0
data/data/stopwords/hi +97 -0
data/data/stopwords/ja +43 -0
data/data/stopwords/ru +420 -0
data/data/stopwords/tr +199 -30
data/data/stopwords/vi +647 -0
data/data/stopwords/zh +125 -0
data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
data/lib/classifier-reborn/bayes.rb +141 -65
data/lib/classifier-reborn/category_namer.rb +6 -4
data/lib/classifier-reborn/extensions/hasher.rb +22 -39
data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
data/lib/classifier-reborn/extensions/vector.rb +35 -28
data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
data/lib/classifier-reborn/lsi/content_node.rb +35 -25
data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
data/lib/classifier-reborn/lsi/word_list.rb +5 -6
data/lib/classifier-reborn/lsi.rb +166 -94
data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
data/lib/classifier-reborn/version.rb +3 -1
data/lib/classifier-reborn.rb +12 -1
metadata +98 -17
data/bin/bayes.rb +0 -36
data/bin/summarize.rb +0 -16

data/lib/classifier-reborn/backends/bayes_memory_backend.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+module ClassifierReborn
+  class BayesMemoryBackend
+    attr_reader :total_words, :total_trainings
+    # This class provides Memory as the storage backend for the classifier data structures
+    def initialize
+      @total_words     = 0
+      @total_trainings = 0
+      @category_counts = {}
+      @categories      = {}
+    end
+    def update_total_words(diff)
+      @total_words += diff
+    end
+    def update_total_trainings(diff)
+      @total_trainings += diff
+    end
+    def category_training_count(category)
+      category_counts(category)[:training]
+    end
+    def update_category_training_count(category, diff)
+      category_counts(category)[:training] += diff
+    end
+    def category_has_trainings?(category)
+      @category_counts.key?(category) && category_training_count(category) > 0
+    end
+    def category_word_count(category)
+      category_counts(category)[:word]
+    end
+    def update_category_word_count(category, diff)
+      category_counts(category)[:word] += diff
+    end
+    def add_category(category)
+      @categories[category] ||= Hash.new(0)
+    end
+    def category_keys
+      @categories.keys
+    end
+    def category_word_frequency(category, word)
+      @categories[category][word]
+    end
+    def update_category_word_frequency(category, word, diff)
+      @categories[category][word] += diff
+    end
+    def delete_category_word(category, word)
+      @categories[category].delete(word)
+    end
+    def word_in_category?(category, word)
+      @categories[category].key?(word)
+    end
+    def reset
+      initialize
+    end
+    private
+    def category_counts(category)
+      @category_counts[category] ||= { training: 0, word: 0 }
+    end
+  end
+end

data/lib/classifier-reborn/backends/bayes_redis_backend.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+require_relative 'no_redis_error'
+# The redis gem is required lazily, when #initialize runs. This way only people
+# who actually use this backend need to have redis installed, and they do not
+# have to require 'classifier-reborn/backends/bayes_redis_backend' themselves.
+module ClassifierReborn
+  # This class provides Redis as the storage backend for the classifier data structures
+  class BayesRedisBackend
+    # The class can be created with the same arguments that the redis gem accepts
+    # E.g.,
+    #      b = ClassifierReborn::BayesRedisBackend.new
+    #      b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
+    #      b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
+    #
+    # Options available are:
+    #   url:                lambda { ENV["REDIS_URL"] }
+    #   scheme:             "redis"
+    #   host:               "127.0.0.1"
+    #   port:               6379
+    #   path:               nil
+    #   timeout:            5.0
+    #   password:           nil
+    #   db:                 0
+    #   driver:             nil
+    #   id:                 nil
+    #   tcp_keepalive:      0
+    #   reconnect_attempts: 1
+    #   inherit_socket:     false
+    def initialize(options = {})
+      begin # because some people don't have redis installed
+        require 'redis'
+      rescue LoadError
+        raise NoRedisError
+      end
+      @redis = Redis.new(options)
+      @redis.setnx(:total_words, 0)
+      @redis.setnx(:total_trainings, 0)
+    end
+    def total_words
+      @redis.get(:total_words).to_i
+    end
+    def update_total_words(diff)
+      @redis.incrby(:total_words, diff)
+    end
+    def total_trainings
+      @redis.get(:total_trainings).to_i
+    end
+    def update_total_trainings(diff)
+      @redis.incrby(:total_trainings, diff)
+    end
+    def category_training_count(category)
+      @redis.hget(:category_training_count, category).to_i
+    end
+    def update_category_training_count(category, diff)
+      @redis.hincrby(:category_training_count, category, diff)
+    end
+    def category_has_trainings?(category)
+      category_training_count(category) > 0
+    end
+    def category_word_count(category)
+      @redis.hget(:category_word_count, category).to_i
+    end
+    def update_category_word_count(category, diff)
+      @redis.hincrby(:category_word_count, category, diff)
+    end
+    def add_category(category)
+      @redis.sadd(:category_keys, category)
+    end
+    def category_keys
+      @redis.smembers(:category_keys).map(&:intern)
+    end
+    def category_word_frequency(category, word)
+      @redis.hget(category, word).to_i
+    end
+    def update_category_word_frequency(category, word, diff)
+      @redis.hincrby(category, word, diff)
+    end
+    def delete_category_word(category, word)
+      @redis.hdel(category, word)
+    end
+    def word_in_category?(category, word)
+      @redis.hexists(category, word)
+    end
+    def reset
+      @redis.flushdb
+      @redis.set(:total_words, 0)
+      @redis.set(:total_trainings, 0)
+    end
+  end
+end

data/lib/classifier-reborn/backends/no_redis_error.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+class NoRedisError < RuntimeError
+  def initialize
+    msg =
+      %q(The Redis Backend can only be used if Redis is installed.
+        This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
+        If you have encountered this error and would like to use the Redis Backend,
+        please run 'gem install redis' or include 'gem "redis"' in
+        your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
+      )
+    super(msg)
+  end
+end

data/lib/classifier-reborn/bayes.rb CHANGED Viewed

@@ -1,8 +1,17 @@
+# frozen_string_literal: true
 # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
+require 'set'
+require_relative 'extensions/tokenizer/whitespace'
+require_relative 'extensions/token_filter/stopword'
+require_relative 'extensions/token_filter/stemmer'
 require_relative 'category_namer'
+require_relative 'backends/bayes_memory_backend'
+require_relative 'backends/bayes_redis_backend'
 module ClassifierReborn
   class Bayes
@@ -13,33 +22,46 @@ module ClassifierReborn
     #      b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
     #
     # Options available are:
-    #   language:         'en'   Used to select language specific stop words
-    #   auto_categorize:  false  When true, enables ability to dynamically declare a category
-    #   enable_threshold: false  When true, enables a threshold requirement for classifition
-    #   threshold:        0.0    Default threshold, only used when enabled
+    #   language:         'en'                    Used to select language specific stop words
+    #   auto_categorize:  false                   When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
+    #   enable_threshold: false                   When true, enables a threshold requirement for classification
+    #   threshold:        0.0                     Default threshold, only used when enabled
+    #   enable_stemmer:   true                    When false, disables word stemming
+    #   stopwords:        nil                     Accepts a path to a text file or an array of words; when supplied, overwrites the default stopwords. Assign an empty string or array to disable stopwords
+    #   backend:          BayesMemoryBackend.new  Alternatively, BayesRedisBackend.new for persistent storage
     def initialize(*args)
-      @categories = Hash.new
-      options = { language:         'en',
-                  auto_categorize:  false,
+      @initial_categories = []
+      options = { language: 'en',
                   enable_threshold: false,
-                  threshold:        0.0
-                }
-      args.flatten.each { |arg|
-        if arg.kind_of?(Hash)
+                  threshold: 0.0,
+                  enable_stemmer: true,
+                  backend: BayesMemoryBackend.new }
+      args.flatten.each do |arg|
+        if arg.is_a?(Hash)
           options.merge!(arg)
         else
-          add_category(arg)
+          @initial_categories.push(arg)
         end
-      }
+      end
-      @total_words         = 0
-      @category_counts     = Hash.new(0)
-      @category_word_count = Hash.new(0)
+      unless options.key?(:auto_categorize)
+        options[:auto_categorize] = @initial_categories.empty? ? true : false
+      end
       @language            = options[:language]
       @auto_categorize     = options[:auto_categorize]
       @enable_threshold    = options[:enable_threshold]
       @threshold           = options[:threshold]
+      @enable_stemmer      = options[:enable_stemmer]
+      @backend             = options[:backend]
+      @tokenizer           = options[:tokenizer] || Tokenizer::Whitespace
+      @token_filters       = options[:token_filters] || [TokenFilter::Stopword]
+      @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
+      TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
+      populate_initial_categories
+      custom_stopwords options[:stopwords] if options.key?(:stopwords)
     end
     # Provides a general training method for all categories specified in Bayes#new
@@ -49,23 +71,28 @@ module ClassifierReborn
     #     b.train "that", "That text"
     #     b.train "The other", "The other text"
     def train(category, text)
+      word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                   tokenizer: @tokenizer, token_filters: @token_filters)
+      return if word_hash.empty?
       category = CategoryNamer.prepare_name(category)
       # Add the category dynamically or raise an error
-      if !@categories.has_key?(category)
+      unless category_keys.include?(category)
         if @auto_categorize
           add_category(category)
         else
-          raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
+          raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
         end
       end
-      @category_counts[category] += 1
-      Hasher.word_hash(text, @language).each do |word, count|
-        @categories[category][word]      +=     count
-        @category_word_count[category]   += count
-        @total_words += count
+      word_hash.each do |word, count|
+        @backend.update_category_word_frequency(category, word, count)
+        @backend.update_category_word_count(category, count)
+        @backend.update_total_words(count)
       end
+      @backend.update_total_trainings(1)
+      @backend.update_category_training_count(category, 1)
     end
     # Provides an untraining method for all categories specified in Bayes#new
@@ -76,23 +103,26 @@ module ClassifierReborn
     #     b.train :this, "This text"
     #     b.untrain :this, "This text"
     def untrain(category, text)
+      word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                   tokenizer: @tokenizer, token_filters: @token_filters)
+      return if word_hash.empty?
       category = CategoryNamer.prepare_name(category)
-      @category_counts[category] -= 1
-      Hasher.word_hash(text, @language).each do |word, count|
-        if @total_words >= 0
-          orig = @categories[category][word] || 0
-          @categories[category][word]      -=     count
-          if @categories[category][word] <= 0
-            @categories[category].delete(word)
-            count = orig
-          end
-          if @category_word_count[category] >= count
-            @category_word_count[category] -= count
-          end
-          @total_words -= count
+      word_hash.each do |word, count|
+        next if @backend.total_words < 0
+        orig = @backend.category_word_frequency(category, word) || 0
+        @backend.update_category_word_frequency(category, word, -count)
+        if @backend.category_word_frequency(category, word) <= 0
+          @backend.delete_category_word(category, word)
+          count = orig
         end
+        @backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
+        @backend.update_total_words(-count)
       end
+      @backend.update_total_trainings(-1)
+      @backend.update_category_training_count(category, -1)
     end
     # Returns the scores in each category for the provided +text+. E.g.,
@@ -100,21 +130,27 @@ module ClassifierReborn
     #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
     def classifications(text)
-      score = Hash.new
-      word_hash = Hasher.word_hash(text, @language)
-      training_count = @category_counts.values.reduce(:+).to_f
-      @categories.each do |category, category_words|
+      score = {}
+      word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                   tokenizer: @tokenizer, token_filters: @token_filters)
+      if word_hash.empty?
+        category_keys.each do |category|
+          score[category.to_s] = Float::INFINITY
+        end
+        return score
+      end
+      category_keys.each do |category|
         score[category.to_s] = 0
-        total = (@category_word_count[category] || 1).to_f
-        word_hash.each do |word, count|
-          s = category_words.has_key?(word) ? category_words[word] : 0.1
-          score[category.to_s] += Math.log(s/total)
+        total = (@backend.category_word_count(category) || 1).to_f
+        word_hash.each do |word, _count|
+          s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
+          score[category.to_s] += Math.log(s / total)
         end
         # now add prior probability for the category
-        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
-        score[category.to_s] += Math.log(s / training_count)
+        s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
+        score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
       end
-      return score
+      score
     end
     # Returns the classification of the provided +text+, which is one of the
@@ -128,21 +164,15 @@ module ClassifierReborn
     # Return the classification without the score
     def classify(text)
       result, score = classify_with_score(text)
-      if threshold_enabled?
-        result = nil if score < @threshold || score == Float::INFINITY
-      end
-      return result
+      result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
+      result
     end
     # Retrieve the current threshold value
-    def threshold
-      @threshold
-    end
+    attr_reader :threshold
     # Dynamically set the threshold value
-    def threshold=(a_float)
-      @threshold = a_float
-    end
+    attr_writer :threshold
     # Dynamically enable threshold for classify results
     def enable_threshold
@@ -164,6 +194,16 @@ module ClassifierReborn
       !@enable_threshold
     end
+    # Is word stemming enabled?
+    def stemmer_enabled?
+      @enable_stemmer
+    end
+    # Is word stemming disabled?
+    def stemmer_disabled?
+      !@enable_stemmer
+    end
     # Provides training and untraining methods for the categories specified in Bayes#new
     # For example:
     #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
@@ -174,21 +214,29 @@ module ClassifierReborn
     def method_missing(name, *args)
       cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
       category = CategoryNamer.prepare_name(cleaned_name)
-      if @categories.has_key? category
-        args.each { |text| eval("#{$1}train(category, text)") }
+      if category_keys.include?(category)
+        args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
       elsif name.to_s =~ /(un)?train_([\w]+)/
         raise StandardError, "No such category: #{category}"
       else
-        super  #raise StandardError, "No such method: #{name}"
+        super # raise StandardError, "No such method: #{name}"
       end
     end
     # Provides a list of category names
     # For example:
     #     b.categories
-    #     =>   ['This', 'That', 'the_other']
-    def categories # :nodoc:
-      @categories.keys.collect {|c| c.to_s}
+    #     =>   ["This", "That", "The other"]
+    def categories
+      category_keys.collect(&:to_s)
+    end
+    # Provides a list of category keys as symbols
+    # For example:
+    #     b.category_keys
+    #     =>   [:This, :That, :"The other"]
+    def category_keys
+      @backend.category_keys
     end
     # Allows you to add categories to the classifier.
@@ -200,9 +248,37 @@ module ClassifierReborn
     # more criteria than the trained selective categories. In short,
     # try to initialize your categories at initialization.
     def add_category(category)
-      @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
+      category = CategoryNamer.prepare_name(category)
+      @backend.add_category(category)
     end
     alias append_category add_category
+    def reset
+      @backend.reset
+      populate_initial_categories
+    end
+    private
+    def populate_initial_categories
+      @initial_categories.each do |c|
+        add_category(c)
+      end
+    end
+    # Overwrites the default stopwords for current language with supplied list of stopwords or file
+    def custom_stopwords(stopwords)
+      unless stopwords.is_a?(Enumerable)
+        if stopwords.strip.empty?
+          stopwords = []
+        elsif File.exist?(stopwords)
+          stopwords = File.read(stopwords).force_encoding('utf-8').split
+        else
+          return # Do not overwrite the default
+        end
+      end
+      TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
+    end
   end
 end

data/lib/classifier-reborn/category_namer.rb CHANGED Viewed

@@ -1,17 +1,19 @@
+# frozen_string_literal: true
 # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
-require 'fast_stemmer'
 require 'classifier-reborn/extensions/hasher'
 module ClassifierReborn
   module CategoryNamer
-    extend self
-    def prepare_name(name)
+    module_function
+    def prepare_name(name)
       return name if name.is_a?(Symbol)
-      name.to_s.gsub("_"," ").capitalize.intern
+      name.to_s.tr('_', ' ').capitalize.intern
     end
   end
 end

data/lib/classifier-reborn/extensions/hasher.rb CHANGED Viewed

@@ -1,59 +1,42 @@
-# encoding: utf-8
+# frozen_string_literal: true
 # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
 require 'set'
+require_relative 'tokenizer/whitespace'
+require_relative 'token_filter/stopword'
+require_relative 'token_filter/stemmer'
 module ClassifierReborn
   module Hasher
-    STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
-    extend self
+    module_function
     # Return a Hash of strings => ints. Each word in the string is stemmed,
     # interned, and indexes to its frequency in the document.
-    def word_hash(str, language = 'en')
-      cleaned_word_hash = clean_word_hash(str, language)
-      symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
-      return cleaned_word_hash.merge(symbol_hash)
-    end
-    # Return a word hash without extra punctuation or short symbols, just stemmed words
-    def clean_word_hash(str, language = 'en')
-      word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
-    end
-    def word_hash_for_words(words, language = 'en')
-      d = Hash.new(0)
-      words.each do |word|
-        if word.length > 2 && !STOPWORDS[language].include?(word)
-          d[word.stem.intern] += 1
+    def word_hash(str, enable_stemmer = true,
+                  tokenizer: Tokenizer::Whitespace,
+                  token_filters: [TokenFilter::Stopword])
+      if token_filters.include?(TokenFilter::Stemmer)
+        unless enable_stemmer
+          token_filters.reject! do |token_filter|
+            token_filter == TokenFilter::Stemmer
+          end
         end
+      else
+        token_filters << TokenFilter::Stemmer if enable_stemmer
+      end
+      words = tokenizer.call(str)
+      token_filters.each do |token_filter|
+        words = token_filter.call(words)
       end
-      return d
-    end
-    def word_hash_for_symbols(words)
       d = Hash.new(0)
       words.each do |word|
         d[word.intern] += 1
       end
-      return d
-    end
-    # Create a lazily-loaded hash of stopword data
-    STOPWORDS = Hash.new do |hash, language|
-      hash[language] = []
-      STOPWORDS_PATH.each do |path|
-        if File.exist?(File.join(path, language))
-          hash[language] = Set.new File.read(File.join(path, language.to_s)).split
-          break
-        end
-      end
-      hash[language]
+      d
     end
   end
 end

data/lib/classifier-reborn/extensions/token_filter/stemmer.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License::   LGPL
+module ClassifierReborn
+  module TokenFilter
+    # This filter converts given tokens to their stemmed versions.
+    module Stemmer
+      module_function
+      def call(tokens)
+        tokens.collect do |token|
+          if token.stemmable?
+            token.stem
+          else
+            token
+          end
+        end
+      end
+    end
+  end
+end