classifier-reborn 2.0.4 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +5 -5
  2. data/LICENSE +74 -1
  3. data/README.markdown +57 -207
  4. data/data/stopwords/ar +104 -0
  5. data/data/stopwords/bn +362 -0
  6. data/data/stopwords/hi +97 -0
  7. data/data/stopwords/ja +43 -0
  8. data/data/stopwords/ru +420 -0
  9. data/data/stopwords/tr +199 -30
  10. data/data/stopwords/vi +647 -0
  11. data/data/stopwords/zh +125 -0
  12. data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
  13. data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
  14. data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
  15. data/lib/classifier-reborn/bayes.rb +141 -65
  16. data/lib/classifier-reborn/category_namer.rb +6 -4
  17. data/lib/classifier-reborn/extensions/hasher.rb +22 -39
  18. data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
  19. data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
  20. data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
  21. data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
  22. data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
  23. data/lib/classifier-reborn/extensions/vector.rb +35 -28
  24. data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
  25. data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
  26. data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
  27. data/lib/classifier-reborn/lsi/content_node.rb +35 -25
  28. data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
  29. data/lib/classifier-reborn/lsi/word_list.rb +5 -6
  30. data/lib/classifier-reborn/lsi.rb +166 -94
  31. data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
  32. data/lib/classifier-reborn/version.rb +3 -1
  33. data/lib/classifier-reborn.rb +12 -1
  34. metadata +98 -17
  35. data/bin/bayes.rb +0 -36
  36. data/bin/summarize.rb +0 -16
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module ClassifierReborn
  module TokenFilter
    # This filter removes stopwords in the language, from given tokens.
    # Tokens of two characters or fewer are also treated as stopwords.
    module Stopword
      # Search path(s) for per-language stopword files. Custom paths added
      # via add_custom_stopword_path take precedence (they are prepended).
      STOPWORDS_PATH = [File.expand_path("#{File.dirname(__FILE__)}/../../../../data/stopwords")]
      @language = 'en'

      module_function

      # Rejects tokens that may be stopwords (per token.maybe_stopword?)
      # and are either very short or listed in the current language's set.
      def call(tokens)
        tokens.reject do |token|
          token.maybe_stopword? &&
            (token.length <= 2 || STOPWORDS[@language].include?(token))
        end
      end

      # Add custom path to a new stopword file created by user
      def add_custom_stopword_path(path)
        STOPWORDS_PATH.unshift(path)
      end

      # Create a lazily-loaded hash of stopword data, keyed by language code.
      # The first matching file on STOPWORDS_PATH wins; languages with no
      # stopword file resolve to an empty collection.
      STOPWORDS = Hash.new do |hash, language|
        hash[language] = []

        STOPWORDS_PATH.each do |path|
          # Fix: normalize the language code to a String once. The original
          # passed `language` (possibly a Symbol) to File.join in the
          # existence check but `language.to_s` in the read, so a Symbol
          # language raised TypeError before the file could be loaded.
          file = File.join(path, language.to_s)
          next unless File.exist?(file)

          hash[language] = Set.new(File.read(file).force_encoding('utf-8').split)
          break
        end

        hash[language]
      end

      # Changes the language of stopwords
      def language=(language)
        @language = language
      end
    end
  end
end
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module ClassifierReborn
  module TokenFilter
    # This filter removes symbol-only terms, from given tokens.
    module Symbol
      # Matches any character that is neither whitespace nor a word character.
      SYMBOL_CHAR = /[^\s\p{WORD}]/.freeze

      module_function

      # Returns the tokens with terms containing symbol characters removed.
      def call(tokens)
        tokens.reject { |token| token.match?(SYMBOL_CHAR) }
      end
    end
  end
end
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

module ClassifierReborn
  module Tokenizer
    # A single token: a String augmented with tokenizer metadata. E.g.,
    #   t = ClassifierReborn::Tokenizer::Token.new 'Tokenize', stemmable: true, maybe_stopword: false
    #
    # Attributes available are:
    #   stemmable:      possibility that the token can be stemmed; must be
    #                   false for un-stemmable terms, otherwise true.
    #   maybe_stopword: possibility that the token is a stopword; must be
    #                   false for terms that can never be stopwords,
    #                   otherwise true.
    class Token < String
      def initialize(string, stemmable: true, maybe_stopword: true)
        super(string)
        @stemmable = stemmable
        @maybe_stopword = maybe_stopword
      end

      # True when the token is a candidate for stemming.
      def stemmable?
        @stemmable
      end

      # True when the token could be a stopword.
      def maybe_stopword?
        @maybe_stopword
      end

      # Returns a new Token wrapping the stemmed form, carrying the original
      # metadata over. NOTE(review): relies on String#stem being provided by
      # a stemmer library loaded elsewhere in the gem.
      def stem
        self.class.new(super, stemmable: @stemmable, maybe_stopword: @maybe_stopword)
      end
    end
  end
end
# frozen_string_literal: true

# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL

require_relative 'token'

module ClassifierReborn
  module Tokenizer
    # This tokenizes given input as white-space separated terms.
    # It mainly aims to tokenize sentences written with a space between
    # words, like English, French, and others. Symbol characters are emitted
    # as separate one-character tokens that are neither stemmable nor
    # stopword candidates.
    module Whitespace
      module_function

      def call(str)
        # Word terms: strip symbol characters, downcase, split on whitespace.
        word_tokens = str.gsub(/[^\p{WORD}\s]/, '').downcase.split.map do |term|
          Token.new(term, stemmable: true, maybe_stopword: true)
        end
        # Symbol terms: every non-word, non-space character on its own.
        symbol_tokens = str.scan(/[^\s\p{WORD}]/).map do |char|
          Token.new(char, stemmable: false, maybe_stopword: false)
        end
        word_tokens + symbol_tokens
      end
    end
  end
end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Ernest Ellingson
2
4
  # Copyright:: Copyright (c) 2005
3
5
 
@@ -6,73 +8,78 @@
6
8
  require 'matrix'
7
9
 
8
10
  class Matrix
9
- def Matrix.diag(s)
10
- Matrix.diagonal(*s)
11
+ def self.diag(s)
12
+ Matrix.diagonal(*s)
11
13
  end
12
14
 
13
- alias :trans :transpose
15
+ alias trans transpose
14
16
 
15
17
  def SV_decomp(maxSweeps = 20)
16
- if self.row_size >= self.column_size
17
- q = self.trans * self
18
- else
19
- q = self * self.trans
20
- end
18
+ q = if row_size >= column_size
19
+ trans * self
20
+ else
21
+ self * trans
22
+ end
21
23
 
22
24
  qrot = q.dup
23
25
  v = Matrix.identity(q.row_size)
24
26
  mzrot = nil
25
27
  cnt = 0
26
28
  s_old = nil
27
- mu = nil
28
29
 
29
- while true do
30
+ loop do
30
31
  cnt += 1
31
- for row in (0...qrot.row_size-1) do
32
- for col in (1..qrot.row_size-1) do
32
+ (0...qrot.row_size - 1).each do |row|
33
+ (1..qrot.row_size - 1).each do |col|
33
34
  next if row == col
34
- h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
35
+
36
+ h = if (2.0 * qrot[row, col]) == (qrot[row, row] - qrot[col, col])
37
+ Math.atan(1) / 2.0
38
+ else
39
+ Math.atan((2.0 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
40
+ end
35
41
  hcos = Math.cos(h)
36
42
  hsin = Math.sin(h)
37
43
  mzrot = Matrix.identity(qrot.row_size)
38
- mzrot[row,row] = hcos
39
- mzrot[row,col] = -hsin
40
- mzrot[col,row] = hsin
41
- mzrot[col,col] = hcos
44
+ mzrot[row, row] = hcos
45
+ mzrot[row, col] = -hsin
46
+ mzrot[col, row] = hsin
47
+ mzrot[col, col] = hcos
42
48
  qrot = mzrot.trans * qrot * mzrot
43
- v = v * mzrot
49
+ v *= mzrot
44
50
  end
45
51
  end
46
52
  s_old = qrot.dup if cnt == 1
47
53
  sum_qrot = 0.0
48
54
  if cnt > 1
49
55
  qrot.row_size.times do |r|
50
- sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
56
+ sum_qrot += (qrot[r, r] - s_old[r, r]).abs if (qrot[r, r] - s_old[r, r]).abs > 0.001
51
57
  end
52
58
  s_old = qrot.dup
53
59
  end
54
- break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
60
+ break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
55
61
  end # of do while true
56
62
  s = []
57
63
  qrot.row_size.times do |r|
58
- s << Math.sqrt(qrot[r,r])
64
+ s << Math.sqrt(qrot[r, r])
59
65
  end
60
- #puts "cnt = #{cnt}"
61
- if self.row_size >= self.column_size
62
- mu = self * v * Matrix.diagonal(*s).inverse
66
+ # puts "cnt = #{cnt}"
67
+ if row_size >= column_size
68
+ mu = self * v * Matrix.diagonal(*s).inverse
63
69
  return [mu, v, s]
64
70
  else
65
71
  puts v.row_size
66
72
  puts v.column_size
67
- puts self.row_size
68
- puts self.column_size
73
+ puts row_size
74
+ puts column_size
69
75
  puts s.size
70
76
 
71
- mu = (self.trans * v * Matrix.diagonal(*s).inverse)
77
+ mu = (trans * v * Matrix.diagonal(*s).inverse)
72
78
  return [mu, v, s]
73
79
  end
74
80
  end
75
- def []=(i,j,val)
81
+
82
+ def []=(i, j, val)
76
83
  @rows[i][j] = val
77
84
  end
78
85
  end
@@ -1,20 +1,20 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module GSL
2
-
3
4
  class Vector
4
- def _dump(v)
5
- Marshal.dump( self.to_a )
5
+ def _dump(_v)
6
+ Marshal.dump(to_a)
6
7
  end
7
-
8
+
8
9
  def self._load(arr)
9
10
  arry = Marshal.load(arr)
10
- return GSL::Vector.alloc(arry)
11
+ GSL::Vector.alloc(arry)
11
12
  end
12
-
13
13
  end
14
-
14
+
15
15
  class Matrix
16
- class <<self
17
- alias :diag :diagonal
18
- end
16
+ class <<self
17
+ alias diag diagonal
18
+ end
19
19
  end
20
20
  end
# frozen_string_literal: true

# Adds a zero-vector predicate to the stdlib Vector class (require 'matrix').
class Vector
  # True when every component of the vector is zero.
  def zero?
    to_a.all? { |component| component.zero? }
  end
end
@@ -1,9 +1,10 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
2
4
  # Copyright:: Copyright (c) 2015 Kelley Reynolds
3
5
  # License:: LGPL
4
6
 
5
7
  module ClassifierReborn
6
-
7
8
  # Subclass of ContentNode which caches the search_vector transpositions.
8
9
  # Its great because its much faster for large indexes, but at the cost of more ram. Additionally,
9
10
  # if you Marshal your classifier and want to keep the size down, you'll need to manually
@@ -16,7 +17,7 @@ module ClassifierReborn
16
17
  end
17
18
  end
18
19
 
19
- def initialize( word_hash, *categories )
20
+ def initialize(word_hash, *categories)
20
21
  clear_cache!
21
22
  super
22
23
  end
@@ -29,13 +30,13 @@ module ClassifierReborn
29
30
  def transposed_search_vector
30
31
  @transposed_search_vector ||= super
31
32
  end
32
-
33
+
33
34
  # Clear the cache before we continue on
34
- def raw_vector_with( word_list )
35
+ def raw_vector_with(word_list)
35
36
  clear_cache!
36
37
  super
37
38
  end
38
-
39
+
39
40
  # We don't want the cached_data here
40
41
  def marshal_dump
41
42
  [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
@@ -1,12 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
4
6
 
5
7
  module ClassifierReborn
6
-
7
- # This is an internal data structure class for the LSI node. Save for
8
- # raw_vector_with, it should be fairly straightforward to understand.
9
- # You should never have to use it directly.
8
+ # This is an internal data structure class for the LSI node. Save for
9
+ # raw_vector_with, it should be fairly straightforward to understand.
10
+ # You should never have to use it directly.
10
11
  class ContentNode
11
12
  attr_accessor :raw_vector, :raw_norm,
12
13
  :lsi_vector, :lsi_norm,
@@ -15,7 +16,7 @@ module ClassifierReborn
15
16
  attr_reader :word_hash
16
17
  # If text_proc is not specified, the source will be duck-typed
17
18
  # via source.to_s
18
- def initialize( word_hash, *categories )
19
+ def initialize(word_hash, *categories)
19
20
  @categories = categories || []
20
21
  @word_hash = word_hash
21
22
  @lsi_norm, @lsi_vector = nil
@@ -28,7 +29,11 @@ module ClassifierReborn
28
29
 
29
30
  # Method to access the transposed search vector
30
31
  def transposed_search_vector
31
- search_vector.col
32
+ if $SVD == :numo
33
+ search_vector
34
+ else
35
+ search_vector.col
36
+ end
32
37
  end
33
38
 
34
39
  # Use this to fetch the appropriate search vector in normalized form.
@@ -38,21 +43,25 @@ module ClassifierReborn
38
43
 
39
44
  # Creates the raw vector out of word_hash using word_list as the
40
45
  # key for mapping the vector space.
41
- def raw_vector_with( word_list )
42
- if $GSL
43
- vec = GSL::Vector.alloc(word_list.size)
44
- else
45
- vec = Array.new(word_list.size, 0)
46
- end
46
+ def raw_vector_with(word_list)
47
+ vec = if $SVD == :numo
48
+ Numo::DFloat.zeros(word_list.size)
49
+ elsif $SVD == :gsl
50
+ GSL::Vector.alloc(word_list.size)
51
+ else
52
+ Array.new(word_list.size, 0)
53
+ end
47
54
 
48
55
  @word_hash.each_key do |word|
49
56
  vec[word_list[word]] = @word_hash[word] if word_list[word]
50
57
  end
51
58
 
52
59
  # Perform the scaling transform and force floating point arithmetic
53
- if $GSL
60
+ if $SVD == :numo
61
+ total_words = vec.sum.to_f
62
+ elsif $SVD == :gsl
54
63
  sum = 0.0
55
- vec.each {|v| sum += v }
64
+ vec.each { |v| sum += v }
56
65
  total_words = sum
57
66
  else
58
67
  total_words = vec.reduce(0, :+).to_f
@@ -60,10 +69,10 @@ module ClassifierReborn
60
69
 
61
70
  total_unique_words = 0
62
71
 
63
- if $GSL
72
+ if [:numo, :gsl].include?($SVD)
64
73
  vec.each { |word| total_unique_words += 1 if word != 0.0 }
65
74
  else
66
- total_unique_words = vec.count{ |word| word != 0 }
75
+ total_unique_words = vec.count { |word| word != 0 }
67
76
  end
68
77
 
69
78
  # Perform first-order association transform if this vector has more
@@ -71,9 +80,9 @@ module ClassifierReborn
71
80
  if total_words > 1.0 && total_unique_words > 1
72
81
  weighted_total = 0.0
73
82
  # Cache calculations, this takes too long on large indexes
74
- cached_calcs = Hash.new { |hash, term|
75
- hash[term] = (( term / total_words ) * Math.log( term / total_words ))
76
- }
83
+ cached_calcs = Hash.new do |hash, term|
84
+ hash[term] = ((term / total_words) * Math.log(term / total_words))
85
+ end
77
86
 
78
87
  vec.each do |term|
79
88
  weighted_total += cached_calcs[term] if term > 0.0
@@ -81,15 +90,18 @@ module ClassifierReborn
81
90
 
82
91
  # Cache calculations, this takes too long on large indexes
83
92
  cached_calcs = Hash.new do |hash, val|
84
- hash[val] = Math.log( val + 1 ) / -weighted_total
93
+ hash[val] = Math.log(val + 1) / -weighted_total
85
94
  end
86
95
 
87
- vec.collect! { |val|
96
+ vec = vec.map do |val|
88
97
  cached_calcs[val]
89
- }
98
+ end
90
99
  end
91
100
 
92
- if $GSL
101
+ if $SVD == :numo
102
+ @raw_norm = vec / Numo::Linalg.norm(vec)
103
+ @raw_vector = vec
104
+ elsif $SVD == :gsl
93
105
  @raw_norm = vec.normalize
94
106
  @raw_vector = vec
95
107
  else
@@ -97,7 +109,5 @@ module ClassifierReborn
97
109
  @raw_vector = Vector[*vec]
98
110
  end
99
111
  end
100
-
101
112
  end
102
-
103
113
  end
@@ -1,16 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
5
  # License:: LGPL
4
6
 
5
7
  module ClassifierReborn
6
8
  module Summarizer
7
- extend self
9
+ module_function
8
10
 
9
- def summary( str, count=10, separator=" [...] " )
11
+ def summary(str, count = 10, separator = ' [...] ')
10
12
  perform_lsi split_sentences(str), count, separator
11
13
  end
12
14
 
13
- def paragraph_summary( str, count=1, separator=" [...] " )
15
+ def paragraph_summary(str, count = 1, separator = ' [...] ')
14
16
  perform_lsi split_paragraphs(str), count, separator
15
17
  end
16
18
 
@@ -23,11 +25,11 @@ module ClassifierReborn
23
25
  end
24
26
 
25
27
  def perform_lsi(chunks, count, separator)
26
- lsi = ClassifierReborn::LSI.new :auto_rebuild => false
28
+ lsi = ClassifierReborn::LSI.new auto_rebuild: false
27
29
  chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
28
30
  lsi.build_index
29
31
  summaries = lsi.highest_relative_content count
30
- return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
32
+ summaries.select { |chunk| summaries.include? chunk }.map(&:strip).join(separator)
31
33
  end
32
34
  end
33
35
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
@@ -8,19 +10,17 @@ module ClassifierReborn
8
10
 
9
11
  class WordList
10
12
  def initialize
11
- @location_table = Hash.new
13
+ @location_table = {}
12
14
  end
13
15
 
14
16
  # Adds a word (if it is new) and assigns it a unique dimension.
15
17
  def add_word(word)
16
- term = word
17
- @location_table[term] = @location_table.size unless @location_table[term]
18
+ @location_table[word] = @location_table.size unless @location_table[word]
18
19
  end
19
20
 
20
21
  # Returns the dimension of the word or nil if the word is not in the space.
21
22
  def [](lookup)
22
- term = lookup
23
- @location_table[term]
23
+ @location_table[lookup]
24
24
  end
25
25
 
26
26
  def word_for_index(ind)
@@ -31,6 +31,5 @@ module ClassifierReborn
31
31
  def size
32
32
  @location_table.size
33
33
  end
34
-
35
34
  end
36
35
  end