RubyGems - classifier - Versions diffs - 2.3.2 → 2.4.0 - Mend

classifier 2.3.2 → 2.4.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/README.md +21 -5
data/lib/classifier/bayes.rb +12 -7
data/lib/classifier/config.rb +31 -0
data/lib/classifier/extensions/word_hash.rb +9 -9
data/lib/classifier/logistic_regression.rb +18 -10
data/lib/classifier/lsi.rb +12 -3
data/lib/classifier/tfidf.rb +15 -7
data/lib/classifier/version.rb +1 -1
data/lib/classifier.rb +1 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ef8c75a0dfe0e6da3e2a7cf49d970c738913515b2f29f8759b8c2bc0df5ac8d0
-  data.tar.gz: 03b67f752c656af13f87edfd32fe56fe201e1a0521978b06ed6139a5d01569a9
+  metadata.gz: c30fac948021b0009e53c7c4a232ac3e2472707fd7fb476cbb3f36e4912af399
+  data.tar.gz: 7120fa872d6ae6b49a8117c6dd672f2ee63d8b62384b348c7db8ca8cf790f88a
 SHA512:
-  metadata.gz: 42ac44b3698b864e27a47a79c468971e4c33f73e465a0c552715a93422e1cbbe009c57275378a330dd00607d4d3048590c805373e85f5bf31ecfaca4504ed311
-  data.tar.gz: 41fe7ff430019ff41a97a110784419049914f3a86cdcc383349c58770ab5419cbcb6bb99ae930d08058e1003d4efc8026616317b2096352abe473d54203e99c5
+  metadata.gz: e9250266207fe481dfec4d09fc8d30a97e591649edcea4d0a975a5a59c379819341fa46b219ee3ecc218cbb1c156fd708bb288478b322dde81333000abecf43f
+  data.tar.gz: 7dbf662acb9b5819dd77224690668654211de7abe2c0b009fc059696880479401bd0e2e503f002b6862da955bf794990b00784856751438898c999b2f9cc49f6

data/README.md CHANGED Viewed

@@ -27,7 +27,7 @@ gem 'classifier'
 Or install via Homebrew for CLI-only usage:
 ```bash
-brew install classifier
+brew install cardmagic/tap/classifier
 ```
 ## Command Line
@@ -36,15 +36,15 @@ Classify text instantly with pre-trained models—no coding required:
 ```bash
 # Detect spam
-classifier classify "You won a free iPhone!" -r sms-spam-filter
+classifier -r sms-spam-filter "You won a free iPhone"
 # => spam
 # Analyze sentiment
-classifier classify "This movie was absolutely amazing!" -r imdb-sentiment
+classifier -r imdb-sentiment "This movie was absolutely amazing"
 # => positive
 # Detect emotions
-classifier classify "I'm so happy today!" -r emotion-detection
+classifier -r emotion-detection "I am so happy today"
 # => joy
 # List all available models
@@ -59,12 +59,28 @@ classifier train positive reviews/good/*.txt
 classifier train negative reviews/bad/*.txt
 # Classify new text
-classifier classify "Great product, highly recommend"
+classifier "Great product, highly recommend"
 # => positive
 ```
 [CLI Guide →](https://rubyclassifier.com/docs/guides/cli/basics)
+### Claude Code Plugin
+Install as a plugin to get skills (auto-invoked) and slash commands:
+```bash
+# Add the marketplace
+claude plugin marketplace add cardmagic/ai-marketplace
+# Install the plugin
+claude plugin install classifier@cardmagic
+```
+This gives you:
+- **Skill**: Claude automatically classifies text when you ask about spam, sentiment, or emotions
+- **Slash commands**: `/classifier:classify`, `/classifier:train`, `/classifier:models`
 ## Quick Start
 ### Bayesian

data/lib/classifier/bayes.rb CHANGED Viewed

@@ -20,6 +20,7 @@ module Classifier
     # @rbs @cached_vocab_size: Integer?
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @min_word_length: Integer
     attr_accessor :storage
@@ -27,8 +28,9 @@ module Classifier
     # initialized and given a training method. E.g.,
     #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
     #      b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
-    # @rbs (*String | Symbol | Array[String | Symbol]) -> void
-    def initialize(*categories)
+    #      b = Classifier::Bayes.new 'Spam', min_word_length: 1
+    # @rbs (*String | Symbol | Array[String | Symbol], ?min_word_length: Integer) -> void
+    def initialize(*categories, min_word_length: Classifier.config.min_word_length)
       super()
       @categories = {}
       categories.flatten.each { |category| @categories[category.prepare_category_name] = {} }
@@ -39,6 +41,7 @@ module Classifier
       @cached_vocab_size = nil
       @dirty = false
       @storage = nil
+      @min_word_length = min_word_length
     end
     # Trains the classifier with text for a category.
@@ -76,7 +79,7 @@ module Classifier
     #
     # @rbs (String) -> Hash[String, Float]
     def classifications(text)
-      words = text.word_hash.keys
+      words = text.word_hash(@min_word_length).keys
       synchronize do
         training_count = cached_training_count
         vocab_size = cached_vocab_size
@@ -117,7 +120,8 @@ module Classifier
         categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
         total_words: @total_words,
         category_counts: @category_counts.transform_keys(&:to_s),
-        category_word_count: @category_word_count.transform_keys(&:to_s)
+        category_word_count: @category_word_count.transform_keys(&:to_s),
+        min_word_length: @min_word_length
       }
     end
@@ -409,7 +413,7 @@ module Classifier
         invalidate_caches
         @dirty = true
         batch.each do |text|
-          word_hash = text.word_hash
+          word_hash = text.word_hash(@min_word_length)
           @category_counts[category] += 1
           word_hash.each do |word, count|
             @categories[category][word] ||= 0
@@ -425,7 +429,7 @@ module Classifier
     # @rbs (String | Symbol, String) -> void
     def train_single(category, text)
       category = category.prepare_category_name
-      word_hash = text.word_hash
+      word_hash = text.word_hash(@min_word_length)
       synchronize do
         invalidate_caches
         @dirty = true
@@ -443,7 +447,7 @@ module Classifier
     # @rbs (String | Symbol, String) -> void
     def untrain_single(category, text)
       category = category.prepare_category_name
-      word_hash = text.word_hash
+      word_hash = text.word_hash(@min_word_length)
       synchronize do
         invalidate_caches
         @dirty = true
@@ -487,6 +491,7 @@ module Classifier
       @cached_vocab_size = nil
       @dirty = false
       @storage = nil
+      @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
       data['categories'].each do |cat_name, words|
         @categories[cat_name.to_sym] = words.transform_keys(&:to_sym)

data/lib/classifier/config.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# rbs_inline: enabled
+module Classifier
+  # @rbs @config: Config?
+  # This lazy initialization is not thread-safe.
+  # In multi-threaded environments, ensure this method is called
+  # or configuration is set explicitly during startup before using classifiers.
+  # @rbs () -> Config
+  def config
+    @config ||= Config.new
+  end
+  # @rbs () { (Config) -> void } -> void
+  def configure(&block)
+    block&.call(config)
+  end
+  module_function :config, :configure
+  class Config
+    # @rbs @min_word_length: Integer
+    attr_accessor :min_word_length #: Integer
+    # @rbs () -> void
+    def initialize
+      @min_word_length = 3
+    end
+  end
+end

data/lib/classifier/extensions/word_hash.rb CHANGED Viewed

@@ -20,27 +20,27 @@ class String
   # Return a Hash of strings => ints. Each word in the string is stemmed,
   # interned, and indexes to its frequency in the document.
-  # @rbs () -> Hash[Symbol, Integer]
-  def word_hash
-    word_hash = clean_word_hash
+  # @rbs (?Integer) -> Hash[Symbol, Integer]
+  def word_hash(min_word_length = 3)
+    word_hash = clean_word_hash(min_word_length)
     symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
     word_hash.merge(symbol_hash)
   end
   # Return a word hash without extra punctuation or short symbols, just stemmed words
-  # @rbs () -> Hash[Symbol, Integer]
-  def clean_word_hash
-    word_hash_for_words gsub(/[^\w\s]/, '').split
+  # @rbs (?Integer) -> Hash[Symbol, Integer]
+  def clean_word_hash(min_word_length = 3)
+    word_hash_for_words(gsub(/[^\w\s]/, '').split, min_word_length)
   end
   private
-  # @rbs (Array[String]) -> Hash[Symbol, Integer]
-  def word_hash_for_words(words)
+  # @rbs (Array[String], Integer) -> Hash[Symbol, Integer]
+  def word_hash_for_words(words, min_word_length)
     d = Hash.new(0)
     words.each do |word|
       word.downcase!
-      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length >= min_word_length
     end
     d
   end

data/lib/classifier/logistic_regression.rb CHANGED Viewed

@@ -34,6 +34,7 @@ module Classifier
     # @rbs @fitted: bool
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @min_word_length: Integer
     attr_accessor :storage
@@ -53,13 +54,16 @@ module Classifier
     # - regularization: L2 regularization strength (default: 0.01)
     # - max_iterations: Maximum training iterations (default: 100)
     # - tolerance: Convergence threshold (default: 1e-4)
+    # - min_word_length: Minimum word length filter in tokenization
     #
     # @rbs (*String | Symbol | Array[String | Symbol], ?learning_rate: Float, ?regularization: Float,
-    #       ?max_iterations: Integer, ?tolerance: Float) -> void
+    #       ?max_iterations: Integer, ?tolerance: Float, ?min_word_length: Integer) -> void
+    # rubocop:disable Metrics/ParameterLists
     def initialize(*categories, learning_rate: DEFAULT_LEARNING_RATE,
                    regularization: DEFAULT_REGULARIZATION,
                    max_iterations: DEFAULT_MAX_ITERATIONS,
-                   tolerance: DEFAULT_TOLERANCE)
+                   tolerance: DEFAULT_TOLERANCE,
+                   min_word_length: Classifier.config.min_word_length)
       super()
       categories = categories.flatten
       @categories = categories.map { |c| c.to_s.prepare_category_name }
@@ -74,7 +78,9 @@ module Classifier
       @fitted = false
       @dirty = false
       @storage = nil
+      @min_word_length = min_word_length
     end
+    # rubocop:enable Metrics/ParameterLists
     # Trains the classifier with text for a category.
     #
@@ -130,7 +136,7 @@ module Classifier
     def probabilities(text)
       raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
-      features = text.word_hash
+      features = text.word_hash(@min_word_length)
       synchronize do
         softmax(compute_scores(features))
       end
@@ -143,7 +149,7 @@ module Classifier
     def classifications(text)
       raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
-      features = text.word_hash
+      features = text.word_hash(@min_word_length)
       synchronize do
         compute_scores(features).transform_keys(&:to_s)
       end
@@ -239,7 +245,8 @@ module Classifier
         regularization: @regularization,
         max_iterations: @max_iterations,
         tolerance: @tolerance,
-        fitted: @fitted
+        fitted: @fitted,
+        min_word_length: @min_word_length
       }
     end
@@ -336,7 +343,7 @@ module Classifier
     def marshal_dump
       fit unless @fitted
       [@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
-       @max_iterations, @tolerance, @fitted]
+       @max_iterations, @tolerance, @fitted, @min_word_length]
     end
     # Custom marshal deserialization to recreate mutex.
@@ -345,7 +352,7 @@ module Classifier
     def marshal_load(data)
       mu_initialize
       @categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
-        @max_iterations, @tolerance, @fitted = data
+        @max_iterations, @tolerance, @fitted, @min_word_length = data
       @training_data = []
       @dirty = false
       @storage = nil
@@ -395,7 +402,7 @@ module Classifier
       reader.each_batch do |batch|
         synchronize do
           batch.each do |text|
-            features = text.word_hash
+            features = text.word_hash(@min_word_length)
             features.each_key { |word| @vocabulary[word] = true }
             @training_data << { category: category, features: features }
           end
@@ -444,7 +451,7 @@ module Classifier
       documents.each_slice(batch_size) do |batch|
         synchronize do
           batch.each do |text|
-            features = text.word_hash
+            features = text.word_hash(@min_word_length)
             features.each_key { |word| @vocabulary[word] = true }
             @training_data << { category: category, features: features }
           end
@@ -463,7 +470,7 @@ module Classifier
       category = category.to_s.prepare_category_name
       raise StandardError, "No such category: #{category}" unless @categories.include?(category)
-      features = text.word_hash
+      features = text.word_hash(@min_word_length)
       synchronize do
         features.each_key { |word| @vocabulary[word] = true }
         @training_data << { category: category, features: features }
@@ -570,6 +577,7 @@ module Classifier
       @fitted = data.fetch('fitted', true)
       @dirty = false
       @storage = nil
+      @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
     end
     def restore_weights_and_bias(data)

data/lib/classifier/lsi.rb CHANGED Viewed

@@ -80,6 +80,7 @@ module Classifier
     # @rbs @u_matrix: Matrix?
     # @rbs @max_rank: Integer
     # @rbs @initial_vocab_size: Integer?
+    # @rbs @min_word_length: Integer
     attr_reader :word_list, :singular_values
     attr_accessor :auto_rebuild, :storage
@@ -110,6 +111,7 @@ module Classifier
       @max_rank = options[:max_rank] || DEFAULT_MAX_RANK
       @u_matrix = nil
       @initial_vocab_size = nil
+      @min_word_length = options[:min_word_length] || Classifier.config.min_word_length
     end
     # Returns true if the index needs to be rebuilt.  The index needs
@@ -216,7 +218,13 @@ module Classifier
     #
     # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
     def add_item(item, *categories, &block)
-      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      clean_word_hash =
+        if block
+          block.call(item).clean_word_hash(@min_word_length)
+        else
+          item.to_s.clean_word_hash(@min_word_length)
+        end
       node = nil
       synchronize do
@@ -480,14 +488,15 @@ module Classifier
     # Custom marshal serialization to exclude mutex state
     # @rbs () -> Array[untyped]
     def marshal_dump
-      [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
+      [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @min_word_length]
     end
     # Custom marshal deserialization to recreate mutex
     # @rbs (Array[untyped]) -> void
     def marshal_load(data)
       mu_initialize
-      @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
+      @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty,
+        @min_word_length = data
       @storage = nil
     end

data/lib/classifier/tfidf.rb CHANGED Viewed

@@ -28,6 +28,7 @@ module Classifier
     # @rbs @fitted: bool
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @min_word_length: Integer
     attr_reader :vocabulary, :idf, :num_documents
     attr_accessor :storage
@@ -36,10 +37,12 @@ module Classifier
     # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
     # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
     # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+    # - min_word_length: minimum word length filter in tokenization
     #
     # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
-    #       ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
-    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+    #       ?ngram_range: Array[Integer], ?sublinear_tf: bool, ?min_word_length: Integer) -> void
+    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false,
+                   min_word_length: Classifier.config.min_word_length)
       validate_df!(min_df, 'min_df')
       validate_df!(max_df, 'max_df')
       validate_ngram_range!(ngram_range)
@@ -54,6 +57,7 @@ module Classifier
       @fitted = false
       @dirty = false
       @storage = nil
+      @min_word_length = min_word_length
     end
     # Learns vocabulary and IDF weights from the corpus.
@@ -204,7 +208,8 @@ module Classifier
         vocabulary: @vocabulary,
         idf: @idf,
         num_documents: @num_documents,
-        fitted: @fitted
+        fitted: @fitted,
+        min_word_length: @min_word_length
       }
     end
@@ -223,7 +228,8 @@ module Classifier
         min_df: data['min_df'],
         max_df: data['max_df'],
         ngram_range: data['ngram_range'],
-        sublinear_tf: data['sublinear_tf']
+        sublinear_tf: data['sublinear_tf'],
+        min_word_length: data['min_word_length'] || Classifier.config.min_word_length
       )
       instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
@@ -238,12 +244,14 @@ module Classifier
     # @rbs () -> Array[untyped]
     def marshal_dump
-      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
+       @min_word_length]
     end
     # @rbs (Array[untyped]) -> void
     def marshal_load(data)
-      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
+        @min_word_length = data
       @dirty = false
       @storage = nil
     end
@@ -334,7 +342,7 @@ module Classifier
       result = Hash.new(0)
       if @ngram_range[0] <= 1
-        word_hash = document.clean_word_hash
+        word_hash = document.clean_word_hash(@min_word_length)
         word_hash.each { |term, count| result[term] += count }
       end

data/lib/classifier/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Classifier
-  VERSION = '2.3.2'.freeze
+  VERSION = '2.4.0'.freeze
 end

data/lib/classifier.rb CHANGED Viewed

@@ -36,3 +36,4 @@ require 'classifier/lsi'
 require 'classifier/knn'
 require 'classifier/tfidf'
 require 'classifier/logistic_regression'
+require 'classifier/config'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: classifier
 version: !ruby/object:Gem::Version
-  version: 2.3.2
+  version: 2.4.0
 platform: ruby
 authors:
 - Lucas Carlson
@@ -162,6 +162,7 @@ files:
 - lib/classifier.rb
 - lib/classifier/bayes.rb
 - lib/classifier/cli.rb
+- lib/classifier/config.rb
 - lib/classifier/errors.rb
 - lib/classifier/extensions/string.rb
 - lib/classifier/extensions/vector.rb
@@ -213,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.3
+rubygems_version: 4.0.10
 specification_version: 4
 summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
   vectorization.