RubyGems - classifier - Versions diffs - 2.3.2 → 2.5.0 - Mend

classifier 2.3.2 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/README.md +21 -5
data/lib/classifier/bayes.rb +39 -21
data/lib/classifier/config.rb +31 -0
data/lib/classifier/extensions/word_hash.rb +9 -9
data/lib/classifier/knn.rb +4 -3
data/lib/classifier/logistic_regression.rb +51 -30
data/lib/classifier/lsi.rb +42 -17
data/lib/classifier/streaming.rb +2 -2
data/lib/classifier/tfidf.rb +15 -7
data/lib/classifier/version.rb +1 -1
data/lib/classifier.rb +1 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ef8c75a0dfe0e6da3e2a7cf49d970c738913515b2f29f8759b8c2bc0df5ac8d0
-  data.tar.gz: 03b67f752c656af13f87edfd32fe56fe201e1a0521978b06ed6139a5d01569a9
+  metadata.gz: 539130382c7ce45072e515ee9817f47fb1144af18498a81c27c7dda842477141
+  data.tar.gz: 1537f2c7c164ec70c14c7b314148fa81cfb947a93dfe328125082567c5a99a7e
 SHA512:
-  metadata.gz: 42ac44b3698b864e27a47a79c468971e4c33f73e465a0c552715a93422e1cbbe009c57275378a330dd00607d4d3048590c805373e85f5bf31ecfaca4504ed311
-  data.tar.gz: 41fe7ff430019ff41a97a110784419049914f3a86cdcc383349c58770ab5419cbcb6bb99ae930d08058e1003d4efc8026616317b2096352abe473d54203e99c5
+  metadata.gz: 3bf1385063c020bb08097192417232a6dc3fce7a16a057bec27ccf62253d69809fedc692f597b7ab6c4e36e8236629188e37add798d883ed670a55cb02c37023
+  data.tar.gz: '0528f07ae03ab30b752c3242b747058956831900c55c1c5a858ab623525075806a52fc8b66236de0f4b959ce678f9a24204887434b19c8662860d06173b2634f'

data/README.md CHANGED Viewed

@@ -27,7 +27,7 @@ gem 'classifier'
 Or install via Homebrew for CLI-only usage:
 ```bash
-brew install classifier
+brew install cardmagic/tap/classifier
 ```
 ## Command Line
@@ -36,15 +36,15 @@ Classify text instantly with pre-trained models—no coding required:
 ```bash
 # Detect spam
-classifier classify "You won a free iPhone!" -r sms-spam-filter
+classifier -r sms-spam-filter "You won a free iPhone"
 # => spam
 # Analyze sentiment
-classifier classify "This movie was absolutely amazing!" -r imdb-sentiment
+classifier -r imdb-sentiment "This movie was absolutely amazing"
 # => positive
 # Detect emotions
-classifier classify "I'm so happy today!" -r emotion-detection
+classifier -r emotion-detection "I am so happy today"
 # => joy
 # List all available models
@@ -59,12 +59,28 @@ classifier train positive reviews/good/*.txt
 classifier train negative reviews/bad/*.txt
 # Classify new text
-classifier classify "Great product, highly recommend"
+classifier "Great product, highly recommend"
 # => positive
 ```
 [CLI Guide →](https://rubyclassifier.com/docs/guides/cli/basics)
+### Claude Code Plugin
+Install as a plugin to get skills (auto-invoked) and slash commands:
+```bash
+# Add the marketplace
+claude plugin marketplace add cardmagic/ai-marketplace
+# Install the plugin
+claude plugin install classifier@cardmagic
+```
+This gives you:
+- **Skill**: Claude automatically classifies text when you ask about spam, sentiment, or emotions
+- **Slash commands**: `/classifier:classify`, `/classifier:train`, `/classifier:models`
 ## Quick Start
 ### Bayesian

data/lib/classifier/bayes.rb CHANGED Viewed

@@ -20,6 +20,7 @@ module Classifier
     # @rbs @cached_vocab_size: Integer?
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @min_word_length: Integer
     attr_accessor :storage
@@ -27,8 +28,9 @@ module Classifier
     # initialized and given a training method. E.g.,
     #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
     #      b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
-    # @rbs (*String | Symbol | Array[String | Symbol]) -> void
-    def initialize(*categories)
+    #      b = Classifier::Bayes.new 'Spam', min_word_length: 1
+    # @rbs (*String | Symbol | Array[String | Symbol], ?min_word_length: Integer) -> void
+    def initialize(*categories, min_word_length: Classifier.config.min_word_length)
       super()
       @categories = {}
       categories.flatten.each { |category| @categories[category.prepare_category_name] = {} }
@@ -39,6 +41,7 @@ module Classifier
       @cached_vocab_size = nil
       @dirty = false
       @storage = nil
+      @min_word_length = min_word_length
     end
     # Trains the classifier with text for a category.
@@ -76,7 +79,7 @@ module Classifier
     #
     # @rbs (String) -> Hash[String, Float]
     def classifications(text)
-      words = text.word_hash.keys
+      words = text.word_hash(@min_word_length).keys
       synchronize do
         training_count = cached_training_count
         vocab_size = cached_vocab_size
@@ -117,7 +120,8 @@ module Classifier
         categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
         total_words: @total_words,
         category_counts: @category_counts.transform_keys(&:to_s),
-        category_word_count: @category_word_count.transform_keys(&:to_s)
+        category_word_count: @category_word_count.transform_keys(&:to_s),
+        min_word_length: @min_word_length
       }
     end
@@ -324,20 +328,14 @@ module Classifier
     #     puts "#{progress.completed} documents processed"
     #   end
     #
-    # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
-    def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
-      category = category.prepare_category_name
-      raise StandardError, "No such category: #{category}" unless @categories.key?(category)
-      reader = Streaming::LineReader.new(io, batch_size: batch_size)
-      total = reader.estimate_line_count
-      progress = Streaming::Progress.new(total: total)
-      reader.each_batch do |batch|
-        train_batch_internal(category, batch)
-        progress.completed += batch.size
-        progress.current_batch += 1
-        yield progress if block_given?
+    # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
+    def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
+      raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
+      raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
+      pairs = category && io ? { category => io } : categories
+      pairs.each do |cat, stream|
+        stream_train_category(cat, stream, batch_size: batch_size, &)
       end
     end
@@ -385,6 +383,25 @@ module Classifier
     private
+    # Trains from an IO stream with a single category.
+    # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
+    def stream_train_category(category, io, batch_size:)
+      category = category.prepare_category_name
+      raise ArgumentError, "No such category: #{category}" unless @categories.key?(category)
+      raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
+      reader = Streaming::LineReader.new(io, batch_size: batch_size)
+      total = reader.estimate_line_count
+      progress = Streaming::Progress.new(total: total)
+      reader.each_batch do |batch|
+        train_batch_internal(category, batch)
+        progress.completed += batch.size
+        progress.current_batch += 1
+        yield progress if block_given?
+      end
+    end
     # Trains a batch of documents for a single category.
     # @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
     def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
@@ -409,7 +426,7 @@ module Classifier
         invalidate_caches
         @dirty = true
         batch.each do |text|
-          word_hash = text.word_hash
+          word_hash = text.word_hash(@min_word_length)
           @category_counts[category] += 1
           word_hash.each do |word, count|
             @categories[category][word] ||= 0
@@ -425,7 +442,7 @@ module Classifier
     # @rbs (String | Symbol, String) -> void
     def train_single(category, text)
       category = category.prepare_category_name
-      word_hash = text.word_hash
+      word_hash = text.word_hash(@min_word_length)
       synchronize do
         invalidate_caches
         @dirty = true
@@ -443,7 +460,7 @@ module Classifier
     # @rbs (String | Symbol, String) -> void
     def untrain_single(category, text)
       category = category.prepare_category_name
-      word_hash = text.word_hash
+      word_hash = text.word_hash(@min_word_length)
       synchronize do
         invalidate_caches
         @dirty = true
@@ -487,6 +504,7 @@ module Classifier
       @cached_vocab_size = nil
       @dirty = false
       @storage = nil
+      @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
       data['categories'].each do |cat_name, words|
         @categories[cat_name.to_sym] = words.transform_keys(&:to_sym)

data/lib/classifier/config.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# rbs_inline: enabled
+module Classifier
+  # @rbs @config: Config?
+  # This lazy initialization is not thread-safe.
+  # In multi-threaded environments, ensure this method is called
+  # or configuration is set explicitly during startup before using classifiers.
+  # @rbs () -> Config
+  def config
+    @config ||= Config.new
+  end
+  # @rbs () { (Config) -> void } -> void
+  def configure(&block)
+    block&.call(config)
+  end
+  module_function :config, :configure
+  class Config
+    # @rbs @min_word_length: Integer
+    attr_accessor :min_word_length #: Integer
+    # @rbs () -> void
+    def initialize
+      @min_word_length = 3
+    end
+  end
+end

data/lib/classifier/extensions/word_hash.rb CHANGED Viewed

@@ -20,27 +20,27 @@ class String
   # Return a Hash of strings => ints. Each word in the string is stemmed,
   # interned, and indexes to its frequency in the document.
-  # @rbs () -> Hash[Symbol, Integer]
-  def word_hash
-    word_hash = clean_word_hash
+  # @rbs (?Integer) -> Hash[Symbol, Integer]
+  def word_hash(min_word_length = 3)
+    word_hash = clean_word_hash(min_word_length)
     symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
     word_hash.merge(symbol_hash)
   end
   # Return a word hash without extra punctuation or short symbols, just stemmed words
-  # @rbs () -> Hash[Symbol, Integer]
-  def clean_word_hash
-    word_hash_for_words gsub(/[^\w\s]/, '').split
+  # @rbs (?Integer) -> Hash[Symbol, Integer]
+  def clean_word_hash(min_word_length = 3)
+    word_hash_for_words(gsub(/[^\w\s]/, '').split, min_word_length)
   end
   private
-  # @rbs (Array[String]) -> Hash[Symbol, Integer]
-  def word_hash_for_words(words)
+  # @rbs (Array[String], Integer) -> Hash[Symbol, Integer]
+  def word_hash_for_words(words, min_word_length)
     d = Hash.new(0)
     words.each do |word|
       word.downcase!
-      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length >= min_word_length
     end
     d
   end

data/lib/classifier/knn.rb CHANGED Viewed

@@ -268,9 +268,10 @@ module Classifier
     #     puts "#{progress.completed} documents processed"
     #   end
     #
-    # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
-    def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &block)
-      @lsi.train_from_stream(category, io, batch_size: batch_size, &block)
+    # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
+    def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
+      # @type var categories: untype
+      @lsi.train_from_stream(category, io, batch_size: batch_size, **categories, &)
       synchronize { @dirty = true }
     end

data/lib/classifier/logistic_regression.rb CHANGED Viewed

@@ -34,6 +34,7 @@ module Classifier
     # @rbs @fitted: bool
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @min_word_length: Integer
     attr_accessor :storage
@@ -53,13 +54,16 @@ module Classifier
     # - regularization: L2 regularization strength (default: 0.01)
     # - max_iterations: Maximum training iterations (default: 100)
     # - tolerance: Convergence threshold (default: 1e-4)
+    # - min_word_length: Minimum word length filter in tokenization
     #
     # @rbs (*String | Symbol | Array[String | Symbol], ?learning_rate: Float, ?regularization: Float,
-    #       ?max_iterations: Integer, ?tolerance: Float) -> void
+    #       ?max_iterations: Integer, ?tolerance: Float, ?min_word_length: Integer) -> void
+    # rubocop:disable Metrics/ParameterLists
     def initialize(*categories, learning_rate: DEFAULT_LEARNING_RATE,
                    regularization: DEFAULT_REGULARIZATION,
                    max_iterations: DEFAULT_MAX_ITERATIONS,
-                   tolerance: DEFAULT_TOLERANCE)
+                   tolerance: DEFAULT_TOLERANCE,
+                   min_word_length: Classifier.config.min_word_length)
       super()
       categories = categories.flatten
       @categories = categories.map { |c| c.to_s.prepare_category_name }
@@ -74,7 +78,9 @@ module Classifier
       @fitted = false
       @dirty = false
       @storage = nil
+      @min_word_length = min_word_length
     end
+    # rubocop:enable Metrics/ParameterLists
     # Trains the classifier with text for a category.
     #
@@ -130,7 +136,7 @@ module Classifier
     def probabilities(text)
       raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
-      features = text.word_hash
+      features = text.word_hash(@min_word_length)
       synchronize do
         softmax(compute_scores(features))
       end
@@ -143,7 +149,7 @@ module Classifier
     def classifications(text)
       raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
-      features = text.word_hash
+      features = text.word_hash(@min_word_length)
       synchronize do
         compute_scores(features).transform_keys(&:to_s)
       end
@@ -239,7 +245,8 @@ module Classifier
         regularization: @regularization,
         max_iterations: @max_iterations,
         tolerance: @tolerance,
-        fitted: @fitted
+        fitted: @fitted,
+        min_word_length: @min_word_length
       }
     end
@@ -336,7 +343,7 @@ module Classifier
     def marshal_dump
       fit unless @fitted
       [@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
-       @max_iterations, @tolerance, @fitted]
+       @max_iterations, @tolerance, @fitted, @min_word_length]
     end
     # Custom marshal deserialization to recreate mutex.
@@ -345,7 +352,7 @@ module Classifier
     def marshal_load(data)
       mu_initialize
       @categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
-        @max_iterations, @tolerance, @fitted = data
+        @max_iterations, @tolerance, @fitted, @min_word_length = data
       @training_data = []
       @dirty = false
       @storage = nil
@@ -383,28 +390,14 @@ module Classifier
     #   end
     #   classifier.fit
     #
-    # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
-    def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
-      category = category.to_s.prepare_category_name
-      raise StandardError, "No such category: #{category}" unless @categories.include?(category)
-      reader = Streaming::LineReader.new(io, batch_size: batch_size)
-      total = reader.estimate_line_count
-      progress = Streaming::Progress.new(total: total)
+    # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
+    def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
+      raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
+      raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
-      reader.each_batch do |batch|
-        synchronize do
-          batch.each do |text|
-            features = text.word_hash
-            features.each_key { |word| @vocabulary[word] = true }
-            @training_data << { category: category, features: features }
-          end
-          @fitted = false
-          @dirty = true
-        end
-        progress.completed += batch.size
-        progress.current_batch += 1
-        yield progress if block_given?
+      pairs = category && io ? { category => io } : categories
+      pairs.each do |cat, stream|
+        stream_train_category(cat, stream, batch_size:, &)
       end
     end
@@ -433,6 +426,33 @@ module Classifier
     private
+    # Trains from an IO stream with a single category.
+    # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
+    def stream_train_category(category, io, batch_size:)
+      category = category.to_s.prepare_category_name
+      raise ArgumentError, "No such category: #{category}" unless @categories.include?(category)
+      raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
+      reader = Streaming::LineReader.new(io, batch_size: batch_size)
+      total = reader.estimate_line_count
+      progress = Streaming::Progress.new(total: total)
+      reader.each_batch do |batch|
+        synchronize do
+          batch.each do |text|
+            features = text.word_hash(@min_word_length)
+            features.each_key { |word| @vocabulary[word] = true }
+            @training_data << { category: category, features: features }
+          end
+          @fitted = false
+          @dirty = true
+        end
+        progress.completed += batch.size
+        progress.current_batch += 1
+        yield progress if block_given?
+      end
+    end
     # Trains a batch of documents for a single category.
     # @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
     def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
@@ -444,7 +464,7 @@ module Classifier
       documents.each_slice(batch_size) do |batch|
         synchronize do
           batch.each do |text|
-            features = text.word_hash
+            features = text.word_hash(@min_word_length)
             features.each_key { |word| @vocabulary[word] = true }
             @training_data << { category: category, features: features }
           end
@@ -463,7 +483,7 @@ module Classifier
       category = category.to_s.prepare_category_name
       raise StandardError, "No such category: #{category}" unless @categories.include?(category)
-      features = text.word_hash
+      features = text.word_hash(@min_word_length)
       synchronize do
         features.each_key { |word| @vocabulary[word] = true }
         @training_data << { category: category, features: features }
@@ -570,6 +590,7 @@ module Classifier
       @fitted = data.fetch('fitted', true)
       @dirty = false
       @storage = nil
+      @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
     end
     def restore_weights_and_bias(data)

data/lib/classifier/lsi.rb CHANGED Viewed

@@ -80,6 +80,7 @@ module Classifier
     # @rbs @u_matrix: Matrix?
     # @rbs @max_rank: Integer
     # @rbs @initial_vocab_size: Integer?
+    # @rbs @min_word_length: Integer
     attr_reader :word_list, :singular_values
     attr_accessor :auto_rebuild, :storage
@@ -110,6 +111,7 @@ module Classifier
       @max_rank = options[:max_rank] || DEFAULT_MAX_RANK
       @u_matrix = nil
       @initial_vocab_size = nil
+      @min_word_length = options[:min_word_length] || Classifier.config.min_word_length
     end
     # Returns true if the index needs to be rebuilt.  The index needs
@@ -216,7 +218,13 @@ module Classifier
     #
     # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
     def add_item(item, *categories, &block)
-      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      clean_word_hash =
+        if block
+          block.call(item).clean_word_hash(@min_word_length)
+        else
+          item.to_s.clean_word_hash(@min_word_length)
+        end
       node = nil
       synchronize do
@@ -480,14 +488,15 @@ module Classifier
     # Custom marshal serialization to exclude mutex state
     # @rbs () -> Array[untyped]
     def marshal_dump
-      [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
+      [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @min_word_length]
     end
     # Custom marshal deserialization to recreate mutex
     # @rbs (Array[untyped]) -> void
     def marshal_load(data)
       mu_initialize
-      @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
+      @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty,
+        @min_word_length = data
       @storage = nil
     end
@@ -653,21 +662,22 @@ module Classifier
     #     puts "#{progress.completed} documents processed"
     #   end
     #
-    # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
-    def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
-      original_auto_rebuild = @auto_rebuild
-      @auto_rebuild = false
+    # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+    # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
+    def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
+      # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+      raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
+      raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
+      pairs = category && io ? { category => io } : categories
+      pairs.each_value do |io|
+        raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
+      end
       begin
-        reader = Streaming::LineReader.new(io, batch_size: batch_size)
-        total = reader.estimate_line_count
-        progress = Streaming::Progress.new(total: total)
-        reader.each_batch do |batch|
-          batch.each { |text| add_item(text, category) }
-          progress.completed += batch.size
-          progress.current_batch += 1
-          yield progress if block_given?
+        original_auto_rebuild = @auto_rebuild
+        @auto_rebuild = false
+        pairs.each do |cat, stream|
+          stream_train_category(cat, stream, batch_size:, &)
         end
       ensure
         @auto_rebuild = original_auto_rebuild
@@ -720,6 +730,21 @@ module Classifier
     private
+    # Trains from an IO stream with a single category.
+    # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
+    def stream_train_category(category, io, batch_size:)
+      reader = Streaming::LineReader.new(io, batch_size: batch_size)
+      total = reader.estimate_line_count
+      progress = Streaming::Progress.new(total: total)
+      reader.each_batch do |batch|
+        batch.each { |text| add_item(text, category) }
+        progress.completed += batch.size
+        progress.current_batch += 1
+        yield progress if block_given?
+      end
+    end
     # Restores LSI state from a JSON string (used by reload)
     # @rbs (String) -> void
     def restore_from_json(json)

data/lib/classifier/streaming.rb CHANGED Viewed

@@ -26,8 +26,8 @@ module Classifier
     # Trains the classifier from an IO stream.
     # Each line in the stream is treated as a separate document.
     #
-    # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
-    def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
+    # @rbs (?(Symbol | String | nil), ?IO?, ?batch_size: Integer, **IO) { (Progress) -> void } -> void
+    def train_from_stream(category = nil, io = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
       raise NotImplementedError, "#{self.class} must implement train_from_stream"
     end

data/lib/classifier/tfidf.rb CHANGED Viewed

@@ -28,6 +28,7 @@ module Classifier
     # @rbs @fitted: bool
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @min_word_length: Integer
     attr_reader :vocabulary, :idf, :num_documents
     attr_accessor :storage
@@ -36,10 +37,12 @@ module Classifier
     # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
     # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
     # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+    # - min_word_length: minimum word length filter in tokenization
     #
     # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
-    #       ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
-    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+    #       ?ngram_range: Array[Integer], ?sublinear_tf: bool, ?min_word_length: Integer) -> void
+    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false,
+                   min_word_length: Classifier.config.min_word_length)
       validate_df!(min_df, 'min_df')
       validate_df!(max_df, 'max_df')
       validate_ngram_range!(ngram_range)
@@ -54,6 +57,7 @@ module Classifier
       @fitted = false
       @dirty = false
       @storage = nil
+      @min_word_length = min_word_length
     end
     # Learns vocabulary and IDF weights from the corpus.
@@ -204,7 +208,8 @@ module Classifier
         vocabulary: @vocabulary,
         idf: @idf,
         num_documents: @num_documents,
-        fitted: @fitted
+        fitted: @fitted,
+        min_word_length: @min_word_length
       }
     end
@@ -223,7 +228,8 @@ module Classifier
         min_df: data['min_df'],
         max_df: data['max_df'],
         ngram_range: data['ngram_range'],
-        sublinear_tf: data['sublinear_tf']
+        sublinear_tf: data['sublinear_tf'],
+        min_word_length: data['min_word_length'] || Classifier.config.min_word_length
       )
       instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
@@ -238,12 +244,14 @@ module Classifier
     # @rbs () -> Array[untyped]
     def marshal_dump
-      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
+       @min_word_length]
     end
     # @rbs (Array[untyped]) -> void
     def marshal_load(data)
-      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
+        @min_word_length = data
       @dirty = false
       @storage = nil
     end
@@ -334,7 +342,7 @@ module Classifier
       result = Hash.new(0)
       if @ngram_range[0] <= 1
-        word_hash = document.clean_word_hash
+        word_hash = document.clean_word_hash(@min_word_length)
         word_hash.each { |term, count| result[term] += count }
       end

data/lib/classifier/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Classifier
-  VERSION = '2.3.2'.freeze
+  VERSION = '2.5.0'.freeze
 end

data/lib/classifier.rb CHANGED Viewed

@@ -36,3 +36,4 @@ require 'classifier/lsi'
 require 'classifier/knn'
 require 'classifier/tfidf'
 require 'classifier/logistic_regression'
+require 'classifier/config'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: classifier
 version: !ruby/object:Gem::Version
-  version: 2.3.2
+  version: 2.5.0
 platform: ruby
 authors:
 - Lucas Carlson
@@ -162,6 +162,7 @@ files:
 - lib/classifier.rb
 - lib/classifier/bayes.rb
 - lib/classifier/cli.rb
+- lib/classifier/config.rb
 - lib/classifier/errors.rb
 - lib/classifier/extensions/string.rb
 - lib/classifier/extensions/vector.rb
@@ -213,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.3
+rubygems_version: 4.0.10
 specification_version: 4
 summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
   vectorization.