RubyGems - classifier - Versions diffs - 1.4.4 → 2.1.0 - Mend

classifier 1.4.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

checksums.yaml +4 -4
data/CLAUDE.md +77 -0
data/README.md +274 -0
data/ext/classifier/classifier_ext.c +25 -0
data/ext/classifier/extconf.rb +15 -0
data/ext/classifier/linalg.h +64 -0
data/ext/classifier/matrix.c +387 -0
data/ext/classifier/svd.c +208 -0
data/ext/classifier/vector.c +319 -0
data/lib/classifier/bayes.rb +294 -60
data/lib/classifier/errors.rb +16 -0
data/lib/classifier/extensions/vector.rb +42 -26
data/lib/classifier/extensions/word_hash.rb +8 -1
data/lib/classifier/lsi/content_node.rb +30 -9
data/lib/classifier/lsi/word_list.rb +12 -1
data/lib/classifier/lsi.rb +479 -125
data/lib/classifier/storage/base.rb +50 -0
data/lib/classifier/storage/file.rb +51 -0
data/lib/classifier/storage/memory.rb +49 -0
data/lib/classifier/storage.rb +9 -0
data/lib/classifier.rb +2 -0
data/sig/vendor/fast_stemmer.rbs +9 -0
data/sig/vendor/gsl.rbs +27 -0
data/sig/vendor/json.rbs +4 -0
data/sig/vendor/matrix.rbs +26 -0
data/sig/vendor/mutex_m.rbs +16 -0
data/test/test_helper.rb +13 -1
metadata +71 -10
data/lib/classifier/extensions/vector_serialize.rb +0 -18

data/lib/classifier/lsi/content_node.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# rbs_inline: enabled
 # Author::    David Fayram  (mailto:dfayram@lensmen.net)
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License::   LGPL
@@ -7,34 +9,49 @@ module Classifier
   # raw_vector_with, it should be fairly straightforward to understand.
   # You should never have to use it directly.
   class ContentNode
-    attr_accessor :raw_vector, :raw_norm,
-                  :lsi_vector, :lsi_norm,
-                  :categories
+    # @rbs @word_hash: Hash[Symbol, Integer]
+    # @rbs @raw_vector: untyped
+    # @rbs @raw_norm: untyped
+    # @rbs @lsi_vector: untyped
+    # @rbs @lsi_norm: untyped
+    attr_accessor :raw_vector, :raw_norm, :lsi_vector, :lsi_norm
+    # @rbs @categories: Array[String | Symbol]
+    attr_accessor :categories
     attr_reader :word_hash
     # If text_proc is not specified, the source will be duck-typed
     # via source.to_s
+    #
+    # @rbs (Hash[Symbol, Integer], *String | Symbol) -> void
     def initialize(word_frequencies, *categories)
       @categories = categories || []
       @word_hash = word_frequencies
     end
     # Use this to fetch the appropriate search vector.
+    #
+    # @rbs () -> untyped
     def search_vector
       @lsi_vector || @raw_vector
     end
     # Use this to fetch the appropriate search vector in normalized form.
+    #
+    # @rbs () -> untyped
     def search_norm
       @lsi_norm || @raw_norm
     end
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
+    #
+    # @rbs (WordList) -> untyped
     def raw_vector_with(word_list)
-      vec = if $GSL
-              GSL::Vector.alloc(word_list.size)
+      vec = if Classifier::LSI.native_available?
+              Classifier::LSI.vector_class.alloc(word_list.size)
             else
               Array.new(word_list.size, 0)
             end
@@ -44,8 +61,9 @@ module Classifier
       end
       # Perform the scaling transform
-      total_words = $GSL ? vec.sum : vec.sum_with_identity
-      total_unique_words = vec.count { |word| word != 0 }
+      total_words = Classifier::LSI.native_available? ? vec.sum : vec.sum_with_identity
+      vec_array = Classifier::LSI.native_available? ? vec.to_a : vec
+      total_unique_words = vec_array.count { |word| word != 0 }
       # Perform first-order association transform if this vector has more
       # than one word in it.
@@ -60,10 +78,13 @@ module Classifier
           val = term_over_total * Math.log(term_over_total)
           weighted_total += val unless val.nan?
         end
-        vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
+        sign = weighted_total.negative? ? 1.0 : -1.0
+        divisor = sign * [weighted_total.abs, Vector::EPSILON].max
+        vec = vec.collect { |val| Math.log(val + 1) / divisor }
       end
-      if $GSL
+      if Classifier::LSI.native_available?
         @raw_norm   = vec.normalize
         @raw_vector = vec
       else

data/lib/classifier/lsi/word_list.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# rbs_inline: enabled
 # Author::    David Fayram  (mailto:dfayram@lensmen.net)
 # Copyright:: Copyright (c) 2005 David Fayram II
 # License::   LGPL
@@ -5,29 +7,38 @@
 module Classifier
   # This class keeps a word => index mapping. It is used to map stemmed words
   # to dimensions of a vector.
   class WordList
+    # @rbs @location_table: Hash[Symbol, Integer]
+    # @rbs () -> void
     def initialize
       @location_table = {}
     end
     # Adds a word (if it is new) and assigns it a unique dimension.
+    #
+    # @rbs (Symbol) -> Integer?
     def add_word(word)
       term = word
       @location_table[term] = @location_table.size unless @location_table[term]
     end
     # Returns the dimension of the word or nil if the word is not in the space.
+    #
+    # @rbs (Symbol) -> Integer?
     def [](lookup)
       term = lookup
       @location_table[term]
     end
+    # @rbs (Integer) -> Symbol?
     def word_for_index(ind)
       @location_table.invert[ind]
     end
     # Returns the number of words mapped.
+    #
+    # @rbs () -> Integer
     def size
       @location_table.size
     end