RubyGems - classifier - Versions diffs - 1.4.4 → 2.1.0 - Mend

classifier 1.4.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

checksums.yaml +4 -4
data/CLAUDE.md +77 -0
data/README.md +274 -0
data/ext/classifier/classifier_ext.c +25 -0
data/ext/classifier/extconf.rb +15 -0
data/ext/classifier/linalg.h +64 -0
data/ext/classifier/matrix.c +387 -0
data/ext/classifier/svd.c +208 -0
data/ext/classifier/vector.c +319 -0
data/lib/classifier/bayes.rb +294 -60
data/lib/classifier/errors.rb +16 -0
data/lib/classifier/extensions/vector.rb +42 -26
data/lib/classifier/extensions/word_hash.rb +8 -1
data/lib/classifier/lsi/content_node.rb +30 -9
data/lib/classifier/lsi/word_list.rb +12 -1
data/lib/classifier/lsi.rb +479 -125
data/lib/classifier/storage/base.rb +50 -0
data/lib/classifier/storage/file.rb +51 -0
data/lib/classifier/storage/memory.rb +49 -0
data/lib/classifier/storage.rb +9 -0
data/lib/classifier.rb +2 -0
data/sig/vendor/fast_stemmer.rbs +9 -0
data/sig/vendor/gsl.rbs +27 -0
data/sig/vendor/json.rbs +4 -0
data/sig/vendor/matrix.rbs +26 -0
data/sig/vendor/mutex_m.rbs +16 -0
data/test/test_helper.rb +13 -1
metadata +71 -10
data/lib/classifier/extensions/vector_serialize.rb +0 -18

data/lib/classifier/bayes.rb CHANGED

@@ -1,39 +1,68 @@
+# rbs_inline: enabled
 # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
+require 'json'
+require 'mutex_m'
 module Classifier
   class Bayes
+    include Mutex_m
+    # @rbs @categories: Hash[Symbol, Hash[Symbol, Integer]]
+    # @rbs @total_words: Integer
+    # @rbs @category_counts: Hash[Symbol, Integer]
+    # @rbs @category_word_count: Hash[Symbol, Integer]
+    # @rbs @cached_training_count: Float?
+    # @rbs @cached_vocab_size: Integer?
+    # @rbs @dirty: bool
+    # @rbs @storage: Storage::Base?
+    attr_accessor :storage
     # The class can be created with one or more categories, each of which will be
     # initialized and given a training method. E.g.,
     #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
+    # @rbs (*String | Symbol) -> void
     def initialize(*categories)
+      super()
       @categories = {}
       categories.each { |category| @categories[category.prepare_category_name] = {} }
       @total_words = 0
       @category_counts = Hash.new(0)
       @category_word_count = Hash.new(0)
+      @cached_training_count = nil
+      @cached_vocab_size = nil
+      @dirty = false
+      @storage = nil
     end
-    #
     # Provides a general training method for all categories specified in Bayes#new
     # For example:
     #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
     #     b.train :this, "This text"
     #     b.train "that", "That text"
     #     b.train "The other", "The other text"
+    #
+    # @rbs (String | Symbol, String) -> void
     def train(category, text)
       category = category.prepare_category_name
-      @category_counts[category] += 1
-      text.word_hash.each do |word, count|
-        @categories[category][word] ||= 0
-        @categories[category][word] += count
-        @total_words += count
-        @category_word_count[category] += count
+      word_hash = text.word_hash
+      synchronize do
+        invalidate_caches
+        @dirty = true
+        @category_counts[category] += 1
+        word_hash.each do |word, count|
+          @categories[category][word] ||= 0
+          @categories[category][word] += count
+          @total_words += count
+          @category_word_count[category] += count
+        end
       end
     end
-    #
     # Provides a untraining method for all categories specified in Bayes#new
     # Be very careful with this method.
     #
@@ -41,54 +70,179 @@ module Classifier
     #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
     #     b.train :this, "This text"
     #     b.untrain :this, "This text"
+    #
+    # @rbs (String | Symbol, String) -> void
     def untrain(category, text)
       category = category.prepare_category_name
-      @category_counts[category] -= 1
-      text.word_hash.each do |word, count|
-        next unless @total_words >= 0
-        orig = @categories[category][word] || 0
-        @categories[category][word] ||= 0
-        @categories[category][word] -= count
-        if @categories[category][word] <= 0
-          @categories[category].delete(word)
-          count = orig
+      word_hash = text.word_hash
+      synchronize do
+        invalidate_caches
+        @dirty = true
+        @category_counts[category] -= 1
+        word_hash.each do |word, count|
+          next unless @total_words >= 0
+          orig = @categories[category][word] || 0
+          @categories[category][word] ||= 0
+          @categories[category][word] -= count
+          if @categories[category][word] <= 0
+            @categories[category].delete(word)
+            count = orig
+          end
+          @category_word_count[category] -= count if @category_word_count[category] >= count
+          @total_words -= count
         end
-        @category_word_count[category] -= count if @category_word_count[category] >= count
-        @total_words -= count
       end
     end
-    #
     # Returns the scores in each category the provided +text+. E.g.,
     #    b.classifications "I hate bad words and you"
     #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
     # The largest of these scores (the one closest to 0) is the one picked out by #classify
+    #
+    # @rbs (String) -> Hash[String, Float]
     def classifications(text)
-      score = {}
-      word_hash = text.word_hash
-      training_count = @category_counts.values.inject { |x, y| x + y }.to_f
-      @categories.each do |category, category_words|
-        score[category.to_s] = 0
-        total = (@category_word_count[category] || 1).to_f
-        word_hash.each_key do |word|
-          s = category_words.key?(word) ? category_words[word] : 0.1
-          score[category.to_s] += Math.log(s / total)
+      words = text.word_hash.keys
+      synchronize do
+        training_count = cached_training_count
+        vocab_size = cached_vocab_size
+        @categories.to_h do |category, category_words|
+          smoothed_total = ((@category_word_count[category] || 0) + vocab_size).to_f
+          # Laplace smoothing: P(word|category) = (count + α) / (total + α * V)
+          word_score = words.sum { |w| Math.log(((category_words[w] || 0) + 1) / smoothed_total) }
+          prior_score = Math.log((@category_counts[category] || 0.1) / training_count)
+          [category.to_s, word_score + prior_score]
         end
-        # now add prior probability for the category
-        s = @category_counts.key?(category) ? @category_counts[category] : 0.1
-        score[category.to_s] += Math.log(s / training_count)
       end
-      score
     end
-    #
     # Returns the classification of the provided +text+, which is one of the
     # categories given in the initializer. E.g.,
     #    b.classify "I hate bad words and you"
     #    =>  'Uninteresting'
+    #
+    # @rbs (String) -> String
     def classify(text)
-      (classifications(text).sort_by { |a| -a[1] })[0][0]
+      best = classifications(text).min_by { |a| -a[1] }
+      raise StandardError, 'No classifications available' unless best
+      best.first.to_s
+    end
+    # Returns a hash representation of the classifier state.
+    # This can be converted to JSON or used directly.
+    #
+    # @rbs () -> untyped
+    def as_json(*)
+      {
+        version: 1,
+        type: 'bayes',
+        categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
+        total_words: @total_words,
+        category_counts: @category_counts.transform_keys(&:to_s),
+        category_word_count: @category_word_count.transform_keys(&:to_s)
+      }
+    end
+    # Serializes the classifier state to a JSON string.
+    # This can be saved to a file and later loaded with Bayes.from_json.
+    #
+    # @rbs () -> String
+    def to_json(*)
+      as_json.to_json
+    end
+    # Loads a classifier from a JSON string or a Hash created by #to_json or #as_json.
+    #
+    # @rbs (String | Hash[String, untyped]) -> Bayes
+    def self.from_json(json)
+      data = json.is_a?(String) ? JSON.parse(json) : json
+      raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'
+      instance = allocate
+      instance.send(:restore_state, data)
+      instance
+    end
+    # Saves the classifier to the configured storage.
+    # Raises ArgumentError if no storage is configured.
+    #
+    # @rbs () -> void
+    def save
+      raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
+      storage.write(to_json)
+      @dirty = false
+    end
+    # Saves the classifier state to a file (legacy API).
+    #
+    # @rbs (String) -> Integer
+    def save_to_file(path)
+      result = File.write(path, to_json)
+      @dirty = false
+      result
+    end
+    # Reloads the classifier from the configured storage.
+    # Raises UnsavedChangesError if there are unsaved changes.
+    # Use reload! to force reload and discard changes.
+    #
+    # @rbs () -> self
+    def reload
+      raise ArgumentError, 'No storage configured' unless storage
+      raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+      restore_from_json(data)
+      @dirty = false
+      self
+    end
+    # Force reloads the classifier from storage, discarding any unsaved changes.
+    #
+    # @rbs () -> self
+    def reload!
+      raise ArgumentError, 'No storage configured' unless storage
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+      restore_from_json(data)
+      @dirty = false
+      self
+    end
+    # Returns true if there are unsaved changes.
+    #
+    # @rbs () -> bool
+    def dirty?
+      @dirty
+    end
+    # Loads a classifier from the configured storage.
+    # The storage is set on the returned instance.
+    #
+    # @rbs (storage: Storage::Base) -> Bayes
+    def self.load(storage:)
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+      instance = from_json(data)
+      instance.storage = storage
+      instance
+    end
+    # Loads a classifier from a file (legacy API).
+    #
+    # @rbs (String) -> Bayes
+    def self.load_from_file(path)
+      from_json(File.read(path))
     end
     #
@@ -100,32 +254,30 @@ module Classifier
     #     b.untrain_that "That text"
     #     b.train_the_other "The other text"
     def method_missing(name, *args)
+      return super unless name.to_s =~ /(un)?train_(\w+)/
       category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
-      if @categories.key?(category)
-        args.each do |text|
-          if name.to_s.start_with?('untrain_')
-            untrain(category, text)
-          else
-            train(category, text)
-          end
-        end
-      elsif name.to_s =~ /(un)?train_(\w+)/
-        raise StandardError, "No such category: #{category}"
-      else
-        super
-      end
+      raise StandardError, "No such category: #{category}" unless @categories.key?(category)
+      method = name.to_s.start_with?('untrain_') ? :untrain : :train
+      args.each { |text| send(method, category, text) }
+    end
+    # @rbs (Symbol, ?bool) -> bool
+    def respond_to_missing?(name, include_private = false)
+      !!(name.to_s =~ /(un)?train_(\w+)/) || super
     end
-    #
     # Provides a list of category names
     # For example:
     #     b.categories
     #     =>   ['This', 'That', 'the_other']
-    def categories # :nodoc:
-      @categories.keys.collect(&:to_s)
+    #
+    # @rbs () -> Array[String]
+    def categories
+      synchronize { @categories.keys.collect(&:to_s) }
     end
-    #
     # Allows you to add categories to the classifier.
     # For example:
     #     b.add_category "Not spam"
@@ -134,13 +286,34 @@ module Classifier
     # result in an undertrained category that will tend to match
     # more criteria than the trained selective categories. In short,
     # try to initialize your categories at initialization.
+    #
+    # @rbs (String | Symbol) -> Hash[Symbol, Integer]
     def add_category(category)
-      @categories[category.prepare_category_name] = {}
+      synchronize do
+        invalidate_caches
+        @dirty = true
+        @categories[category.prepare_category_name] = {}
+      end
     end
     alias append_category add_category
-    #
+    # Custom marshal serialization to exclude mutex state
+    # @rbs () -> Array[untyped]
+    def marshal_dump
+      [@categories, @total_words, @category_counts, @category_word_count, @dirty]
+    end
+    # Custom marshal deserialization to recreate mutex
+    # @rbs (Array[untyped]) -> void
+    def marshal_load(data)
+      mu_initialize
+      @categories, @total_words, @category_counts, @category_word_count, @dirty = data
+      @cached_training_count = nil
+      @cached_vocab_size = nil
+      @storage = nil
+    end
     # Allows you to remove categories from the classifier.
     # For example:
     #     b.remove_category "Spam"
@@ -148,15 +321,76 @@ module Classifier
     # WARNING: Removing categories from a trained classifier will
     # result in the loss of all training data for that category.
     # Make sure you really want to do this before calling this method.
+    #
+    # @rbs (String | Symbol) -> void
     def remove_category(category)
       category = category.prepare_category_name
-      raise StandardError, "No such category: #{category}" unless @categories.key?(category)
+      synchronize do
+        raise StandardError, "No such category: #{category}" unless @categories.key?(category)
+        invalidate_caches
+        @dirty = true
+        @total_words -= @category_word_count[category].to_i
-      @total_words -= @category_word_count[category].to_i
+        @categories.delete(category)
+        @category_counts.delete(category)
+        @category_word_count.delete(category)
+      end
+    end
+    private
+    # Restores classifier state from a JSON string (used by reload)
+    # @rbs (String) -> void
+    def restore_from_json(json)
+      data = JSON.parse(json)
+      raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'
+      synchronize do
+        restore_state(data)
+      end
+    end
+    # Restores classifier state from a hash (used by from_json)
+    # @rbs (Hash[String, untyped]) -> void
+    def restore_state(data)
+      mu_initialize
+      @categories = {} #: Hash[Symbol, Hash[Symbol, Integer]]
+      @total_words = data['total_words']
+      @category_counts = Hash.new(0) #: Hash[Symbol, Integer]
+      @category_word_count = Hash.new(0) #: Hash[Symbol, Integer]
+      @cached_training_count = nil
+      @cached_vocab_size = nil
+      @dirty = false
+      @storage = nil
+      data['categories'].each do |cat_name, words|
+        @categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
+      end
+      data['category_counts'].each do |cat_name, count|
+        @category_counts[cat_name.to_sym] = count
+      end
+      data['category_word_count'].each do |cat_name, count|
+        @category_word_count[cat_name.to_sym] = count
+      end
+    end
+    # @rbs () -> void
+    def invalidate_caches
+      @cached_training_count = nil
+      @cached_vocab_size = nil
+    end
+    # @rbs () -> Float
+    def cached_training_count
+      @cached_training_count ||= @category_counts.values.sum.to_f
+    end
-      @categories.delete(category)
-      @category_counts.delete(category)
-      @category_word_count.delete(category)
+    # @rbs () -> Integer
+    def cached_vocab_size
+      @cached_vocab_size ||= [@categories.values.flat_map(&:keys).uniq.size, 1].max
     end
   end
 end

data/lib/classifier/errors.rb ADDED

@@ -0,0 +1,16 @@
+# rbs_inline: enabled
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2005 Lucas Carlson
+# License::   LGPL
+module Classifier
+  # Base error class for all Classifier errors
+  class Error < StandardError; end
+  # Raised when reload would discard unsaved changes
+  class UnsavedChangesError < Error; end
+  # Raised when a storage operation fails
+  class StorageError < Error; end
+end

data/lib/classifier/extensions/vector.rb CHANGED

@@ -1,3 +1,5 @@
+# rbs_inline: enabled
 # Author::    Ernest Ellingson
 # Copyright:: Copyright (c) 2005
@@ -5,30 +7,41 @@
 require 'matrix'
+# @rbs skip
 class Array
-  def sum_with_identity(identity = 0.0, &block)
+  def sum_with_identity(identity = 0.0, &)
     return identity unless size.to_i.positive?
+    return map(&).sum_with_identity(identity) if block_given?
-    if block_given?
-      map(&block).sum_with_identity(identity)
-    else
-      compact.reduce(:+).to_f || identity.to_f
-    end
+    compact.reduce(identity, :+).to_f
   end
 end
-module VectorExtensions
+# @rbs skip
+class Vector
+  EPSILON = 1e-10
+  # Cache magnitude since Vector is immutable after creation
+  # Note: We undefine the matrix gem's normalize method first, then redefine it
+  # to provide a more robust implementation that handles zero vectors
+  undef_method :normalize if method_defined?(:normalize)
   def magnitude
-    sum_of_squares = 0.to_r
-    size.times do |i|
-      sum_of_squares += self[i]**2.to_r
+    # Cache magnitude since Vector is immutable after creation
+    @magnitude ||= begin
+      sum_of_squares = 0.to_r
+      size.times do |i|
+        sum_of_squares += self[i]**2.to_r
+      end
+      Math.sqrt(sum_of_squares.to_f)
     end
-    Math.sqrt(sum_of_squares.to_f)
   end
   def normalize
+    magnitude_value = magnitude
+    return Vector[*Array.new(size, 0.0)] if magnitude_value <= 0.0
     normalized_values = []
-    magnitude_value = magnitude.to_r
     size.times do |i|
       normalized_values << (self[i] / magnitude_value)
     end
@@ -36,10 +49,7 @@ module VectorExtensions
   end
 end
-class Vector
-  include VectorExtensions
-end
+# @rbs skip
 class Matrix
   def self.diag(diagonal_elements)
     Matrix.diagonal(*diagonal_elements)
@@ -61,14 +71,19 @@ class Matrix
     loop do
       iteration_count += 1
-      (0...q_rotation_matrix.row_size - 1).each do |row|
-        (1..q_rotation_matrix.row_size - 1).each do |col|
+      (0...(q_rotation_matrix.row_size - 1)).each do |row|
+        (1..(q_rotation_matrix.row_size - 1)).each do |col|
           next if row == col
-          angle = Math.atan((2.to_r * q_rotation_matrix[row,
-                                                        col]) / (q_rotation_matrix[row,
-                                                                                   row] - q_rotation_matrix[col,
-                                                                                                            col])) / 2.0
+          numerator = 2.0 * q_rotation_matrix[row, col]
+          denominator = q_rotation_matrix[row, row] - q_rotation_matrix[col, col]
+          angle = if denominator.abs < Vector::EPSILON
+                    numerator >= 0 ? Math::PI / 4.0 : -Math::PI / 4.0
+                  else
+                    Math.atan(numerator / denominator) / 2.0
+                  end
           cosine = Math.cos(angle)
           sine = Math.sin(angle)
           rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
@@ -92,11 +107,12 @@ class Matrix
       break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
     end
-    singular_values = []
-    q_rotation_matrix.row_size.times do |r|
-      singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
+    singular_values = q_rotation_matrix.row_size.times.map do |r|
+      Math.sqrt([q_rotation_matrix[r, r].to_f, 0.0].max)
     end
-    u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
+    safe_singular_values = singular_values.map { |v| [v, Vector::EPSILON].max }
+    u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*safe_singular_values).inverse
     [u_matrix, v_matrix, singular_values]
   end

data/lib/classifier/extensions/word_hash.rb CHANGED

@@ -1,3 +1,5 @@
+# rbs_inline: enabled
 # Author::    Lucas Carlson  (mailto:lucas@rufy.com)
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License::   LGPL
@@ -11,12 +13,14 @@ class String
   # E.g.,
   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
   #   => "Hello  greetings   with  braces         "
+  # @rbs () -> String
   def without_punctuation
-    tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
+    tr(',?.!;:"@#$%^&*()_=+[]{}|<>/`~', ' ').tr("'-", '')
   end
   # Return a Hash of strings => ints. Each word in the string is stemmed,
   # interned, and indexes to its frequency in the document.
+  # @rbs () -> Hash[Symbol, Integer]
   def word_hash
     word_hash = clean_word_hash
     symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
@@ -24,12 +28,14 @@ class String
   end
   # Return a word hash without extra punctuation or short symbols, just stemmed words
+  # @rbs () -> Hash[Symbol, Integer]
   def clean_word_hash
     word_hash_for_words gsub(/[^\w\s]/, '').split
   end
   private
+  # @rbs (Array[String]) -> Hash[Symbol, Integer]
   def word_hash_for_words(words)
     d = Hash.new(0)
     words.each do |word|
@@ -39,6 +45,7 @@ class String
     d
   end
+  # @rbs (Array[String]) -> Hash[Symbol, Integer]
   def word_hash_for_symbols(words)
     d = Hash.new(0)
     words.each do |word|