classifier-reborn 2.0.4 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +74 -1
- data/README.markdown +57 -207
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
- data/lib/classifier-reborn/bayes.rb +141 -65
- data/lib/classifier-reborn/category_namer.rb +6 -4
- data/lib/classifier-reborn/extensions/hasher.rb +22 -39
- data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
- data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
- data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
- data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
- data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
- data/lib/classifier-reborn/extensions/vector.rb +35 -28
- data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
- data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
- data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
- data/lib/classifier-reborn/lsi/content_node.rb +35 -25
- data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
- data/lib/classifier-reborn/lsi/word_list.rb +5 -6
- data/lib/classifier-reborn/lsi.rb +166 -94
- data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
- data/lib/classifier-reborn/version.rb +3 -1
- data/lib/classifier-reborn.rb +12 -1
- metadata +98 -17
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
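The headline changes: pluggable Bayes backends (in-memory and Redis), tokenizer and token-filter extensions, several new stopword languages, a cross-validation helper, and a three-way SVD backend chain for LSI (Numo, then GSL, then pure Ruby). A minimal sketch of the post-upgrade LSI surface, using invented training strings:

```ruby
require 'classifier-reborn'

# Options match the initialize diff below; cache_node_vectors is optional.
lsi = ClassifierReborn::LSI.new(auto_rebuild: true, cache_node_vectors: true)
lsi.add_item 'Dogs are not cats.', :animals
lsi.add_item 'This text deals with ruby gems.', :software

lsi.search('ruby', 1)    # nearest indexed document to the query
lsi.classify('kittens')  # most obvious category, via the voting scheme below
lsi.reset                # new in this release; reinitializes the index
```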
data/lib/classifier-reborn/lsi.rb
CHANGED

```diff
@@ -1,30 +1,45 @@
+# frozen_string_literal: true
+
 # Author:: David Fayram (mailto:dfayram@lensmen.net)
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License:: LGPL
 
+# Try to load Numo first - it's the most current and the most well-supported.
+# Fall back to GSL.
+# Fall back to native vector.
 begin
-  raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
-
-  require 'gsl' # requires http://rb-gsl.rubyforge.org/
-  require_relative 'extensions/vector_serialize'
-  $GSL = true
+  raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+  raise LoadError if ENV['GSL'] == 'true' # to test with gsl, try `rake test GSL=true`
 
+  require 'numo/narray' # https://ruby-numo.github.io/narray/
+  require 'numo/linalg' # https://ruby-numo.github.io/linalg/
+  $SVD = :numo
 rescue LoadError
-  require_relative 'extensions/vector'
+  begin
+    raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+    require 'gsl' # requires https://github.com/SciRuby/rb-gsl
+    require_relative 'extensions/vector_serialize'
+    $SVD = :gsl
+  rescue LoadError
+    $SVD = :ruby
+    require_relative 'extensions/vector'
+    require_relative 'extensions/zero_vector'
+  end
 end
 
 require_relative 'lsi/word_list'
 require_relative 'lsi/content_node'
 require_relative 'lsi/cached_content_node'
 require_relative 'lsi/summarizer'
+require_relative 'extensions/token_filter/stopword'
+require_relative 'extensions/token_filter/symbol'
 
 module ClassifierReborn
-
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
   # data based on underlying semantic relations. For more information on the algorithms used,
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
-
     attr_reader :word_list, :cache_node_vectors
     attr_accessor :auto_rebuild
 
```
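The require chain above records the winner in the global `$SVD` (`:numo`, `:gsl`, or `:ruby`), replacing the old `$GSL` flag. A quick way to confirm which backend actually loaded:

```ruby
require 'classifier-reborn'

# $SVD is set by the nested begin/rescue chain shown above.
case $SVD
when :numo then puts 'SVD via Numo::Linalg'
when :gsl  then puts 'SVD via rb-gsl'
else            puts 'pure-Ruby Matrix fallback'
end
```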
```diff
@@ -36,12 +51,17 @@ module ClassifierReborn
     #
     def initialize(options = {})
       @auto_rebuild = options[:auto_rebuild] != false
-      @word_list, @items = WordList.new, {}
-      @version, @built_at_version = 0, -1
+      @word_list = WordList.new
+      @items = {}
+      @version = 0
+      @built_at_version = -1
       @language = options[:language] || 'en'
-      if @cache_node_vectors = options[:cache_node_vectors]
-        extend CachedContentNode::InstanceMethods
-      end
+      @token_filters = [
+        TokenFilter::Stopword,
+        TokenFilter::Symbol
+      ]
+      TokenFilter::Stopword.language = @language
+      extend CachedContentNode::InstanceMethods if @cache_node_vectors = options[:cache_node_vectors]
     end
 
     # Returns true if the index needs to be rebuilt. The index needs
@@ -64,39 +84,45 @@ module ClassifierReborn
     # ar = ActiveRecordObject.find( :all )
     # lsi.add_item ar, *ar.categories { |x| ar.content }
     #
-    def add_item(item, *categories, &block)
-      clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
-      if clean_word_hash.empty?
-        puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
+    def add_item(item, *categories, &block)
+      clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
+                                         token_filters: @token_filters)
+      if clean_word_hash.empty?
+        puts "Input: '#{item}' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly."
       else
-        @items[item] = @cache_node_vectors ? CachedContentNode.new(clean_word_hash, *categories) : ContentNode.new(clean_word_hash, *categories)
+        @items[item] = if @cache_node_vectors
+                         CachedContentNode.new(clean_word_hash, *categories)
+                       else
+                         ContentNode.new(clean_word_hash, *categories)
+                       end
+        @version += 1
+        build_index if @auto_rebuild
       end
-      @version += 1
-      build_index if @auto_rebuild
     end
 
     # A less flexible shorthand for add_item that assumes
     # you are passing in a string with no categorries. item
     # will be duck typed via to_s .
     #
-    def <<(item)
-      add_item item
+    def <<(item)
+      add_item(item)
     end
 
-    # Returns the categories for a given indexed items. You are free to add and remove
+    # Returns categories for a given indexed item. You are free to add and remove
     # items from this as you see fit. It does not invalide an index to change its categories.
     def categories_for(item)
       return [] unless @items[item]
-      @items[item].categories
+
+      @items[item].categories
     end
 
     # Removes an item from the database, if it is indexed.
     #
-    def remove_item(item)
-      if @items.key? item
-        @items.delete item
-        @version += 1
-      end
+    def remove_item(item)
+      return unless @items.key? item
+
+      @items.delete item
+      @version += 1
     end
 
     # Returns an array of items that are indexed.
@@ -118,30 +144,43 @@ module ClassifierReborn
     # cutoff parameter tells the indexer how many of these values to keep.
     # A value of 1 for cutoff means that no semantic analysis will take place,
     # turning the LSI class into a simple vector search engine.
-    def build_index(cutoff=0.75)
+    def build_index(cutoff = 0.75)
       return unless needs_rebuild?
+
       make_word_list
 
       doc_list = @items.values
-      tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
+      tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
 
-      if $GSL
-        tdm = GSL::Matrix.alloc(*tda).trans
-        ntdm = build_reduced_matrix(tdm, cutoff)
+      if $SVD == :numo
+        tdm = Numo::NArray.asarray(tda.map(&:to_a)).transpose
+        ntdm = numo_build_reduced_matrix(tdm, cutoff)
 
-        ntdm.size[1].times do |col|
-          vec = GSL::Vector.alloc(ntdm.column(col)).row
-          doc_list[col].lsi_vector = vec
-          doc_list[col].lsi_norm = vec.normalize
-        end
+        ntdm.each_over_axis(1).with_index do |col_vec, i|
+          doc_list[i].lsi_vector = col_vec
+          doc_list[i].lsi_norm = col_vec / Numo::Linalg.norm(col_vec)
+        end
+      elsif $SVD == :gsl
+        tdm = GSL::Matrix.alloc(*tda).trans
+        ntdm = build_reduced_matrix(tdm, cutoff)
+
+        ntdm.size[1].times do |col|
+          vec = GSL::Vector.alloc(ntdm.column(col)).row
+          doc_list[col].lsi_vector = vec
+          doc_list[col].lsi_norm = vec.normalize
+        end
       else
-        tdm = Matrix.rows(tda).trans
-        ntdm = build_reduced_matrix(tdm, cutoff)
+        tdm = Matrix.rows(tda).trans
+        ntdm = build_reduced_matrix(tdm, cutoff)
 
-        ntdm.row_size.times do |col|
-          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
-          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
-        end
+        ntdm.column_size.times do |col|
+          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+          if ntdm.column(col).zero?
+            doc_list[col].lsi_norm = ntdm.column(col) if doc_list[col]
+          else
+            doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+          end
+        end
       end
 
       @built_at_version = @version
```
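`build_index` hands the term-document matrix to one of the `*_build_reduced_matrix` helpers (shown further down), which zero out the smallest singular values before reconstructing the matrix at reduced rank. The cutoff arithmetic in isolation, with hypothetical singular values:

```ruby
s = [4.2, 2.9, 1.1, 0.3]  # hypothetical singular values from an SVD
cutoff = 0.75

# Keep the largest values; zero everything below the cutoff-quantile value.
s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]  # => 1.1
s.map { |v| v < s_cutoff ? 0.0 : v }                    # => [4.2, 2.9, 1.1, 0.0]
```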
```diff
@@ -155,13 +194,13 @@ module ClassifierReborn
     # your dataset's general content. For example, if you were to use categorize on the
     # results of this data, you could gather information on what your dataset is generally
     # about.
-    def highest_relative_content(max_chunks=10)
-      return [] if needs_rebuild?
+    def highest_relative_content(max_chunks = 10)
+      return [] if needs_rebuild?
 
-      avg_density = Hash.new
-      @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1] } }
+      avg_density = {}
+      @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x, y| x + y[1] } }
 
-      avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
+      avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
     end
 
     # This function is the primitive that find_related and classify
@@ -176,17 +215,19 @@ module ClassifierReborn
     # The parameter doc is the content to compare. If that content is not
     # indexed, you can pass an optional block to define how to create the
     # text data. See add_item for examples of how this works.
-    def proximity_array_for_content(doc, &block)
+    def proximity_array_for_content(doc, &block)
       return [] if needs_rebuild?
 
-      content_node = node_for_content(doc, &block)
+      content_node = node_for_content(doc, &block)
       result =
         @items.keys.collect do |item|
-          if $GSL
-            val = content_node.search_vector * @items[item].transposed_search_vector
-          else
-            val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
-          end
+          val = if $SVD == :numo
+                  content_node.search_vector.dot(@items[item].transposed_search_vector)
+                elsif $SVD == :gsl
+                  content_node.search_vector * @items[item].transposed_search_vector
+                else
+                  (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+                end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
```
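`proximity_array_for_content` is the primitive behind `find_related` and `classify`: each indexed item is scored by an inner product against the query node, then sorted descending. The pure-Ruby branch reduced to a standalone sketch with toy vectors:

```ruby
require 'matrix'

query = Vector[0.2, 0.9]  # stand-in for content_node.search_vector
items = { 'a' => Vector[0.1, 0.8], 'b' => Vector[0.7, 0.1] }

result = items.map { |item, vec| [item, query.inner_product(vec)] }
result.sort_by { |x| x[1] }.reverse
# => [["a", 0.74], ["b", 0.23]]
```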
```diff
@@ -197,17 +238,28 @@ module ClassifierReborn
     # calculated vectors instead of their full versions. This is useful when
     # you're trying to perform operations on content that is much smaller than
     # the text you're working with. search uses this primitive.
-    def proximity_norms_for_content(doc, &block)
+    def proximity_norms_for_content(doc, &block)
       return [] if needs_rebuild?
 
-      content_node = node_for_content(doc, &block)
+      content_node = node_for_content(doc, &block)
+      if ($SVD == :gsl && content_node.raw_norm.isnan?.all?) ||
+         ($SVD == :numo && content_node.raw_norm.isnan.all?)
+        puts "There are no documents that are similar to #{doc}"
+      else
+        content_node_norms(content_node)
+      end
+    end
+
+    def content_node_norms(content_node)
       result =
         @items.keys.collect do |item|
-          if $GSL
-            val = content_node.search_norm * @items[item].search_norm.col
-          else
-            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
-          end
+          val = if $SVD == :numo
+                  content_node.search_norm.dot(@items[item].search_norm)
+                elsif $SVD == :gsl
+                  content_node.search_norm * @items[item].search_norm.col
+                else
+                  (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+                end
           [item, val]
         end
       result.sort_by { |x| x[1] }.reverse
@@ -220,11 +272,14 @@ module ClassifierReborn
     #
     # While this may seem backwards compared to the other functions that LSI supports,
     # it is actually the same algorithm, just applied on a smaller document.
-    def search(string, max_nearest=3)
+    def search(string, max_nearest = 3)
       return [] if needs_rebuild?
-      carry = proximity_norms_for_content(string)
-      result = carry.collect { |x| x[0] }
-      result[0..max_nearest-1]
+
+      carry = proximity_norms_for_content(string)
+      unless carry.nil?
+        result = carry.collect { |x| x[0] }
+        result[0..max_nearest - 1]
+      end
     end
 
     # This function takes content and finds other documents
@@ -236,21 +291,21 @@ module ClassifierReborn
     # This is particularly useful for identifing clusters in your document space.
     # For example you may want to identify several "What's Related" items for weblog
     # articles, or find paragraphs that relate to each other in an essay.
-    def find_related(doc, max_nearest=3, &block)
+    def find_related(doc, max_nearest = 3, &block)
       carry =
-        proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
+        proximity_array_for_content(doc, &block).reject { |pair| pair[0].eql? doc }
       result = carry.collect { |x| x[0] }
-      result[0..max_nearest-1]
+      result[0..max_nearest - 1]
     end
 
     # Return the most obvious category with the score
-    def classify_with_score(doc, cutoff=0.30, &block)
-      scored_categories(doc, cutoff, &block).last
+    def classify_with_score(doc, cutoff = 0.30, &block)
+      scored_categories(doc, cutoff, &block).last
     end
 
     # Return the most obvious category without the score
-    def classify(doc, cutoff=0.30, &block)
-      scored_categories(doc, cutoff, &block).last.first
+    def classify(doc, cutoff = 0.30, &block)
+      scored_categories(doc, cutoff, &block).last.first
     end
 
     # This function uses a voting system to categorize documents, based on
@@ -262,10 +317,10 @@ module ClassifierReborn
     # text. A cutoff of 1 means that every document in the index votes on
     # what category the document is in. This may not always make sense.
     #
-    def scored_categories(doc, cutoff=0.30, &block)
+    def scored_categories(doc, cutoff = 0.30, &block)
       icutoff = (@items.size * cutoff).round
-      carry = proximity_array_for_content(doc, &block)
-      carry = carry[0..icutoff-1]
+      carry = proximity_array_for_content(doc, &block)
+      carry = carry[0..icutoff - 1]
       votes = Hash.new(0.0)
       carry.each do |pair|
         @items[pair[0]].categories.each do |category|
```
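`scored_categories` converts that ranking into votes: only the `cutoff` fraction of the index nearest the document votes, weighted by proximity score. The cutoff arithmetic with invented numbers:

```ruby
items_size = 20                        # hypothetical indexed-item count
cutoff = 0.30                          # default in the signatures above
icutoff = (items_size * cutoff).round  # => 6 nearest documents get a vote
```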
```diff
@@ -273,56 +328,73 @@ module ClassifierReborn
         end
       end
 
-      votes.sort_by { |pair| pair[1] }
+      votes.sort_by { |_, score| score }
     end
 
     # Prototype, only works on indexed documents.
     # I have no clue if this is going to work, but in theory
     # it's supposed to.
-    def highest_ranked_stems(doc, count=3)
-      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
-      content_vector_array = node_for_content(doc).lsi_vector.to_a
-      top_n = content_vector_array.sort.reverse[0..count-1]
-      top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
+    def highest_ranked_stems(doc, count = 3)
+      raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
+
+      content_vector_array = node_for_content(doc).lsi_vector.to_a
+      top_n = content_vector_array.sort.reverse[0..count - 1]
+      top_n.collect { |x| @word_list.word_for_index(content_vector_array.index(x)) }
+    end
+
+    def reset
+      initialize(auto_rebuild: @auto_rebuild, cache_node_vectors: @cache_node_vectors)
     end
 
     private
-    def build_reduced_matrix(matrix, cutoff=0.75)
+
+    def build_reduced_matrix(matrix, cutoff = 0.75)
       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
       u, v, s = matrix.SV_decomp
-
       # TODO: Better than 75% term, please. :\
       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
       s.size.times do |ord|
         s[ord] = 0.0 if s[ord] < s_cutoff
       end
       # Reconstruct the term document matrix, only with reduced rank
-      u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
+      u * ($SVD == :gsl ? GSL::Matrix : ::Matrix).diag(s) * v.trans
+    end
+
+    def numo_build_reduced_matrix(matrix, cutoff = 0.75)
+      s, u, vt = Numo::Linalg.svd(matrix, driver: 'svd', job: 'S')
+
+      # TODO: Better than 75% term (as above)
+      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+      s.size.times do |ord|
+        s[ord] = 0.0 if s[ord] < s_cutoff
+      end
+
+      # Reconstruct the term document matrix, only with reduced rank
+      u.dot(::Numo::DFloat.eye(s.size) * s).dot(vt)
     end
 
     def node_for_content(item, &block)
       if @items[item]
         return @items[item]
       else
-        clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
+        clean_word_hash = Hasher.word_hash((block ? yield(item) : item.to_s),
+                                           token_filters: @token_filters)
 
-        content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+        content_node = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
 
         unless needs_rebuild?
-          content_node.raw_vector_with(@word_list) # make the lsi raw and norm vectors
+          content_node.raw_vector_with(@word_list) # make the lsi raw and norm vectors
         end
       end
 
-      content_node
+      content_node
     end
 
     def make_word_list
       @word_list = WordList.new
       @items.each_value do |node|
-        node.word_hash.each_key { |key| @word_list.add_word key }
+        node.word_hash.each_key { |key| @word_list.add_word(key) }
       end
     end
-
   end
 end
-
```
data/lib/classifier-reborn/validators/classifier_validator.rb
ADDED

```diff
@@ -0,0 +1,170 @@
+# frozen_string_literal: true
+
+module ClassifierReborn
+  module ClassifierValidator
+    module_function
+
+    def cross_validate(classifier, sample_data, fold = 10, *options)
+      classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
+      sample_data.shuffle!
+      partition_size = sample_data.length / fold
+      partitioned_data = sample_data.each_slice(partition_size)
+      conf_mats = []
+      fold.times do |i|
+        training_data = partitioned_data.take(fold)
+        test_data = training_data.slice!(i)
+        conf_mats << validate(classifier, training_data.flatten!(1), test_data)
+      end
+      classifier.reset
+      generate_report(conf_mats)
+    end
+
+    def validate(classifier, training_data, test_data, *options)
+      classifier = ClassifierReborn.const_get(classifier).new(options) if classifier.is_a?(String)
+      classifier.reset
+      training_data.each do |rec|
+        classifier.train(rec.first, rec.last)
+      end
+      evaluate(classifier, test_data)
+    end
+
+    def evaluate(classifier, test_data)
+      conf_mat = empty_conf_mat(classifier.categories.sort)
+      test_data.each do |rec|
+        actual = rec.first.tr('_', ' ').capitalize
+        predicted = classifier.classify(rec.last)
+        conf_mat[actual][predicted] += 1 unless predicted.nil?
+      end
+      conf_mat
+    end
+
+    def generate_report(*conf_mats)
+      conf_mats.flatten!
+      accumulated_conf_mat = conf_mats.length == 1 ? conf_mats.first : empty_conf_mat(conf_mats.first.keys.sort)
+      header = 'Run     Total   Correct Incorrect  Accuracy'
+      puts
+      puts ' Run Report '.center(header.length, '-')
+      puts header
+      puts '-' * header.length
+      if conf_mats.length > 1
+        conf_mats.each_with_index do |conf_mat, i|
+          run_report = build_run_report(conf_mat)
+          print_run_report(run_report, i + 1)
+          conf_mat.each do |actual, cols|
+            cols.each do |predicted, v|
+              accumulated_conf_mat[actual][predicted] += v
+            end
+          end
+        end
+        puts '-' * header.length
+      end
+      run_report = build_run_report(accumulated_conf_mat)
+      print_run_report(run_report, 'All')
+      puts
+      print_conf_mat(accumulated_conf_mat)
+      puts
+      conf_tab = conf_mat_to_tab(accumulated_conf_mat)
+      print_conf_tab(conf_tab)
+    end
+
+    def build_run_report(conf_mat)
+      correct = incorrect = 0
+      conf_mat.each do |actual, cols|
+        cols.each do |predicted, v|
+          if actual == predicted
+            correct += v
+          else
+            incorrect += v
+          end
+        end
+      end
+      total = correct + incorrect
+      { total: total, correct: correct, incorrect: incorrect, accuracy: divide(correct, total) }
+    end
+
+    def conf_mat_to_tab(conf_mat)
+      conf_tab = Hash.new { |h, k| h[k] = { p: { t: 0, f: 0 }, n: { t: 0, f: 0 } } }
+      conf_mat.each_key do |positive|
+        conf_mat.each do |actual, cols|
+          cols.each do |predicted, v|
+            conf_tab[positive][positive == predicted ? :p : :n][actual == predicted ? :t : :f] += v
+          end
+        end
+      end
+      conf_tab
+    end
+
+    def print_run_report(stats, prefix = '', print_header = false)
+      puts "#{'Run'.rjust([3, prefix.length].max)}     Total   Correct Incorrect  Accuracy" if print_header
+      puts "#{prefix.to_s.rjust(3)} #{stats[:total].to_s.rjust(9)} #{stats[:correct].to_s.rjust(9)} #{stats[:incorrect].to_s.rjust(9)} #{stats[:accuracy].round(5).to_s.ljust(7, '0').rjust(9)}"
+    end
+
+    def print_conf_mat(conf_mat)
+      header = ['Predicted ->'] + conf_mat.keys + %w[Total Recall]
+      cell_size = header.map(&:length).max
+      header = header.map { |h| h.rjust(cell_size) }.join(' ')
+      puts ' Confusion Matrix '.center(header.length, '-')
+      puts header
+      puts '-' * header.length
+      predicted_totals = conf_mat.keys.map { |predicted| [predicted, 0] }.to_h
+      correct = 0
+      conf_mat.each do |k, rec|
+        actual_total = rec.values.reduce(:+)
+        puts ([k.ljust(cell_size)] + rec.values.map { |v| v.to_s.rjust(cell_size) } + [actual_total.to_s.rjust(cell_size), divide(rec[k], actual_total).round(5).to_s.rjust(cell_size)]).join(' ')
+        rec.each do |cat, val|
+          predicted_totals[cat] += val
+          correct += val if cat == k
+        end
+      end
+      total = predicted_totals.values.reduce(:+)
+      puts '-' * header.length
+      puts (['Total'.ljust(cell_size)] + predicted_totals.values.map { |v| v.to_s.rjust(cell_size) } + [total.to_s.rjust(cell_size), ''.rjust(cell_size)]).join(' ')
+      puts (['Precision'.ljust(cell_size)] + predicted_totals.keys.map { |k| divide(conf_mat[k][k], predicted_totals[k]).round(5).to_s.rjust(cell_size) } + ['Accuracy ->'.rjust(cell_size), divide(correct, total).round(5).to_s.rjust(cell_size)]).join(' ')
+    end
+
+    def print_conf_tab(conf_tab)
+      conf_tab.each do |positive, tab|
+        puts "# Positive class: #{positive}"
+        derivations = conf_tab_derivations(tab)
+        print_derivations(derivations)
+        puts
+      end
+    end
+
+    def conf_tab_derivations(tab)
+      positives = tab[:p][:t] + tab[:n][:f]
+      negatives = tab[:n][:t] + tab[:p][:f]
+      total = positives + negatives
+      {
+        total_population: positives + negatives,
+        condition_positive: positives,
+        condition_negative: negatives,
+        true_positive: tab[:p][:t],
+        true_negative: tab[:n][:t],
+        false_positive: tab[:p][:f],
+        false_negative: tab[:n][:f],
+        prevalence: divide(positives, total),
+        specificity: divide(tab[:n][:t], negatives),
+        recall: divide(tab[:p][:t], positives),
+        precision: divide(tab[:p][:t], tab[:p][:t] + tab[:p][:f]),
+        accuracy: divide(tab[:p][:t] + tab[:n][:t], total),
+        f1_score: divide(2 * tab[:p][:t], 2 * tab[:p][:t] + tab[:p][:f] + tab[:n][:f])
+      }
+    end
+
+    def print_derivations(derivations)
+      max_len = derivations.keys.map(&:length).max
+      derivations.each do |k, v|
+        puts k.to_s.tr('_', ' ').capitalize.ljust(max_len) + ' : ' + v.to_s
+      end
+    end
+
+    def empty_conf_mat(categories)
+      categories.map { |actual| [actual, categories.map { |predicted| [predicted, 0] }.to_h] }.to_h
+    end
+
+    def divide(dividend, divisor)
+      divisor.zero? ? 0.0 : dividend / divisor.to_f
+    end
+  end
+end
```
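The validator is driven through `cross_validate`, which takes a classifier name (resolved via `const_get`) or instance plus `[category, text]` records; `validate` trains on `rec.first`/`rec.last`. A hedged usage sketch with invented sample data:

```ruby
require 'classifier-reborn'

sample_data = [
  ['spam', 'cheap watches for sale'],
  ['ham',  'meeting notes from tuesday'],
  # ... enough labelled records to fill every fold
]

# Prints the run report, confusion matrix, and per-class derivations above.
ClassifierReborn::ClassifierValidator.cross_validate('Bayes', sample_data, 5)
```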
data/lib/classifier-reborn.rb
CHANGED

```diff
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 #--
 # Copyright (c) 2005 Lucas Carlson
 #
@@ -25,6 +27,15 @@
 # License:: LGPL
 
 require 'rubygems'
+
+case RUBY_PLATFORM
+when 'java'
+  require 'jruby-stemmer'
+else
+  require 'fast-stemmer'
+end
+
 require_relative 'classifier-reborn/category_namer'
 require_relative 'classifier-reborn/bayes'
-require_relative 'classifier-reborn/lsi'
+require_relative 'classifier-reborn/lsi'
+require_relative 'classifier-reborn/validators/classifier_validator'
```