classifier 2.3.2 → 2.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef8c75a0dfe0e6da3e2a7cf49d970c738913515b2f29f8759b8c2bc0df5ac8d0
4
- data.tar.gz: 03b67f752c656af13f87edfd32fe56fe201e1a0521978b06ed6139a5d01569a9
3
+ metadata.gz: c30fac948021b0009e53c7c4a232ac3e2472707fd7fb476cbb3f36e4912af399
4
+ data.tar.gz: 7120fa872d6ae6b49a8117c6dd672f2ee63d8b62384b348c7db8ca8cf790f88a
5
5
  SHA512:
6
- metadata.gz: 42ac44b3698b864e27a47a79c468971e4c33f73e465a0c552715a93422e1cbbe009c57275378a330dd00607d4d3048590c805373e85f5bf31ecfaca4504ed311
7
- data.tar.gz: 41fe7ff430019ff41a97a110784419049914f3a86cdcc383349c58770ab5419cbcb6bb99ae930d08058e1003d4efc8026616317b2096352abe473d54203e99c5
6
+ metadata.gz: e9250266207fe481dfec4d09fc8d30a97e591649edcea4d0a975a5a59c379819341fa46b219ee3ecc218cbb1c156fd708bb288478b322dde81333000abecf43f
7
+ data.tar.gz: 7dbf662acb9b5819dd77224690668654211de7abe2c0b009fc059696880479401bd0e2e503f002b6862da955bf794990b00784856751438898c999b2f9cc49f6
data/README.md CHANGED
@@ -27,7 +27,7 @@ gem 'classifier'
27
27
  Or install via Homebrew for CLI-only usage:
28
28
 
29
29
  ```bash
30
- brew install classifier
30
+ brew install cardmagic/tap/classifier
31
31
  ```
32
32
 
33
33
  ## Command Line
@@ -36,15 +36,15 @@ Classify text instantly with pre-trained models—no coding required:
36
36
 
37
37
  ```bash
38
38
  # Detect spam
39
- classifier classify "You won a free iPhone!" -r sms-spam-filter
39
+ classifier -r sms-spam-filter "You won a free iPhone"
40
40
  # => spam
41
41
 
42
42
  # Analyze sentiment
43
- classifier classify "This movie was absolutely amazing!" -r imdb-sentiment
43
+ classifier -r imdb-sentiment "This movie was absolutely amazing"
44
44
  # => positive
45
45
 
46
46
  # Detect emotions
47
- classifier classify "I'm so happy today!" -r emotion-detection
47
+ classifier -r emotion-detection "I am so happy today"
48
48
  # => joy
49
49
 
50
50
  # List all available models
@@ -59,12 +59,28 @@ classifier train positive reviews/good/*.txt
59
59
  classifier train negative reviews/bad/*.txt
60
60
 
61
61
  # Classify new text
62
- classifier classify "Great product, highly recommend"
62
+ classifier "Great product, highly recommend"
63
63
  # => positive
64
64
  ```
65
65
 
66
66
  [CLI Guide →](https://rubyclassifier.com/docs/guides/cli/basics)
67
67
 
68
+ ### Claude Code Plugin
69
+
70
+ Install as a plugin to get skills (auto-invoked) and slash commands:
71
+
72
+ ```bash
73
+ # Add the marketplace
74
+ claude plugin marketplace add cardmagic/ai-marketplace
75
+
76
+ # Install the plugin
77
+ claude plugin install classifier@cardmagic
78
+ ```
79
+
80
+ This gives you:
81
+ - **Skill**: Claude automatically classifies text when you ask about spam, sentiment, or emotions
82
+ - **Slash commands**: `/classifier:classify`, `/classifier:train`, `/classifier:models`
83
+
68
84
  ## Quick Start
69
85
 
70
86
  ### Bayesian
@@ -20,6 +20,7 @@ module Classifier
20
20
  # @rbs @cached_vocab_size: Integer?
21
21
  # @rbs @dirty: bool
22
22
  # @rbs @storage: Storage::Base?
23
+ # @rbs @min_word_length: Integer
23
24
 
24
25
  attr_accessor :storage
25
26
 
@@ -27,8 +28,9 @@ module Classifier
27
28
  # initialized and given a training method. E.g.,
28
29
  # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
29
30
  # b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
30
- # @rbs (*String | Symbol | Array[String | Symbol]) -> void
31
- def initialize(*categories)
31
+ # b = Classifier::Bayes.new 'Spam', min_word_length: 1
32
+ # @rbs (*String | Symbol | Array[String | Symbol], ?min_word_length: Integer) -> void
33
+ def initialize(*categories, min_word_length: Classifier.config.min_word_length)
32
34
  super()
33
35
  @categories = {}
34
36
  categories.flatten.each { |category| @categories[category.prepare_category_name] = {} }
@@ -39,6 +41,7 @@ module Classifier
39
41
  @cached_vocab_size = nil
40
42
  @dirty = false
41
43
  @storage = nil
44
+ @min_word_length = min_word_length
42
45
  end
43
46
 
44
47
  # Trains the classifier with text for a category.
@@ -76,7 +79,7 @@ module Classifier
76
79
  #
77
80
  # @rbs (String) -> Hash[String, Float]
78
81
  def classifications(text)
79
- words = text.word_hash.keys
82
+ words = text.word_hash(@min_word_length).keys
80
83
  synchronize do
81
84
  training_count = cached_training_count
82
85
  vocab_size = cached_vocab_size
@@ -117,7 +120,8 @@ module Classifier
117
120
  categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
118
121
  total_words: @total_words,
119
122
  category_counts: @category_counts.transform_keys(&:to_s),
120
- category_word_count: @category_word_count.transform_keys(&:to_s)
123
+ category_word_count: @category_word_count.transform_keys(&:to_s),
124
+ min_word_length: @min_word_length
121
125
  }
122
126
  end
123
127
 
@@ -409,7 +413,7 @@ module Classifier
409
413
  invalidate_caches
410
414
  @dirty = true
411
415
  batch.each do |text|
412
- word_hash = text.word_hash
416
+ word_hash = text.word_hash(@min_word_length)
413
417
  @category_counts[category] += 1
414
418
  word_hash.each do |word, count|
415
419
  @categories[category][word] ||= 0
@@ -425,7 +429,7 @@ module Classifier
425
429
  # @rbs (String | Symbol, String) -> void
426
430
  def train_single(category, text)
427
431
  category = category.prepare_category_name
428
- word_hash = text.word_hash
432
+ word_hash = text.word_hash(@min_word_length)
429
433
  synchronize do
430
434
  invalidate_caches
431
435
  @dirty = true
@@ -443,7 +447,7 @@ module Classifier
443
447
  # @rbs (String | Symbol, String) -> void
444
448
  def untrain_single(category, text)
445
449
  category = category.prepare_category_name
446
- word_hash = text.word_hash
450
+ word_hash = text.word_hash(@min_word_length)
447
451
  synchronize do
448
452
  invalidate_caches
449
453
  @dirty = true
@@ -487,6 +491,7 @@ module Classifier
487
491
  @cached_vocab_size = nil
488
492
  @dirty = false
489
493
  @storage = nil
494
+ @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
490
495
 
491
496
  data['categories'].each do |cat_name, words|
492
497
  @categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
@@ -0,0 +1,31 @@
1
+ # rbs_inline: enabled
2
+
3
+ module Classifier
4
+ # @rbs @config: Config?
5
+
6
+ # This lazy initialization is not thread-safe.
7
+ # In multi-threaded environments, ensure this method is called
8
+ # or configuration is set explicitly during startup before using classifiers.
9
+ # @rbs () -> Config
10
+ def config
11
+ @config ||= Config.new
12
+ end
13
+
14
+ # @rbs () { (Config) -> void } -> void
15
+ def configure(&block)
16
+ block&.call(config)
17
+ end
18
+
19
+ module_function :config, :configure
20
+
21
+ class Config
22
+ # @rbs @min_word_length: Integer
23
+
24
+ attr_accessor :min_word_length #: Integer
25
+
26
+ # @rbs () -> void
27
+ def initialize
28
+ @min_word_length = 3
29
+ end
30
+ end
31
+ end
@@ -20,27 +20,27 @@ class String
20
20
 
21
21
  # Return a Hash of symbols => ints. Each word in the string is stemmed,
22
22
  # interned, and mapped to its frequency in the document.
23
- # @rbs () -> Hash[Symbol, Integer]
24
- def word_hash
25
- word_hash = clean_word_hash
23
+ # @rbs (?Integer) -> Hash[Symbol, Integer]
24
+ def word_hash(min_word_length = 3)
25
+ word_hash = clean_word_hash(min_word_length)
26
26
  symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
27
27
  word_hash.merge(symbol_hash)
28
28
  end
29
29
 
30
30
  # Return a word hash without extra punctuation or short symbols, just stemmed words
31
- # @rbs () -> Hash[Symbol, Integer]
32
- def clean_word_hash
33
- word_hash_for_words gsub(/[^\w\s]/, '').split
31
+ # @rbs (?Integer) -> Hash[Symbol, Integer]
32
+ def clean_word_hash(min_word_length = 3)
33
+ word_hash_for_words(gsub(/[^\w\s]/, '').split, min_word_length)
34
34
  end
35
35
 
36
36
  private
37
37
 
38
- # @rbs (Array[String]) -> Hash[Symbol, Integer]
39
- def word_hash_for_words(words)
38
+ # @rbs (Array[String], Integer) -> Hash[Symbol, Integer]
39
+ def word_hash_for_words(words, min_word_length)
40
40
  d = Hash.new(0)
41
41
  words.each do |word|
42
42
  word.downcase!
43
- d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
43
+ d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length >= min_word_length
44
44
  end
45
45
  d
46
46
  end
@@ -34,6 +34,7 @@ module Classifier
34
34
  # @rbs @fitted: bool
35
35
  # @rbs @dirty: bool
36
36
  # @rbs @storage: Storage::Base?
37
+ # @rbs @min_word_length: Integer
37
38
 
38
39
  attr_accessor :storage
39
40
 
@@ -53,13 +54,16 @@ module Classifier
53
54
  # - regularization: L2 regularization strength (default: 0.01)
54
55
  # - max_iterations: Maximum training iterations (default: 100)
55
56
  # - tolerance: Convergence threshold (default: 1e-4)
57
+ # - min_word_length: Minimum word length filter in tokenization
56
58
  #
57
59
  # @rbs (*String | Symbol | Array[String | Symbol], ?learning_rate: Float, ?regularization: Float,
58
- # ?max_iterations: Integer, ?tolerance: Float) -> void
60
+ # ?max_iterations: Integer, ?tolerance: Float, ?min_word_length: Integer) -> void
61
+ # rubocop:disable Metrics/ParameterLists
59
62
  def initialize(*categories, learning_rate: DEFAULT_LEARNING_RATE,
60
63
  regularization: DEFAULT_REGULARIZATION,
61
64
  max_iterations: DEFAULT_MAX_ITERATIONS,
62
- tolerance: DEFAULT_TOLERANCE)
65
+ tolerance: DEFAULT_TOLERANCE,
66
+ min_word_length: Classifier.config.min_word_length)
63
67
  super()
64
68
  categories = categories.flatten
65
69
  @categories = categories.map { |c| c.to_s.prepare_category_name }
@@ -74,7 +78,9 @@ module Classifier
74
78
  @fitted = false
75
79
  @dirty = false
76
80
  @storage = nil
81
+ @min_word_length = min_word_length
77
82
  end
83
+ # rubocop:enable Metrics/ParameterLists
78
84
 
79
85
  # Trains the classifier with text for a category.
80
86
  #
@@ -130,7 +136,7 @@ module Classifier
130
136
  def probabilities(text)
131
137
  raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
132
138
 
133
- features = text.word_hash
139
+ features = text.word_hash(@min_word_length)
134
140
  synchronize do
135
141
  softmax(compute_scores(features))
136
142
  end
@@ -143,7 +149,7 @@ module Classifier
143
149
  def classifications(text)
144
150
  raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
145
151
 
146
- features = text.word_hash
152
+ features = text.word_hash(@min_word_length)
147
153
  synchronize do
148
154
  compute_scores(features).transform_keys(&:to_s)
149
155
  end
@@ -239,7 +245,8 @@ module Classifier
239
245
  regularization: @regularization,
240
246
  max_iterations: @max_iterations,
241
247
  tolerance: @tolerance,
242
- fitted: @fitted
248
+ fitted: @fitted,
249
+ min_word_length: @min_word_length
243
250
  }
244
251
  end
245
252
 
@@ -336,7 +343,7 @@ module Classifier
336
343
  def marshal_dump
337
344
  fit unless @fitted
338
345
  [@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
339
- @max_iterations, @tolerance, @fitted]
346
+ @max_iterations, @tolerance, @fitted, @min_word_length]
340
347
  end
341
348
 
342
349
  # Custom marshal deserialization to recreate mutex.
@@ -345,7 +352,7 @@ module Classifier
345
352
  def marshal_load(data)
346
353
  mu_initialize
347
354
  @categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
348
- @max_iterations, @tolerance, @fitted = data
355
+ @max_iterations, @tolerance, @fitted, @min_word_length = data
349
356
  @training_data = []
350
357
  @dirty = false
351
358
  @storage = nil
@@ -395,7 +402,7 @@ module Classifier
395
402
  reader.each_batch do |batch|
396
403
  synchronize do
397
404
  batch.each do |text|
398
- features = text.word_hash
405
+ features = text.word_hash(@min_word_length)
399
406
  features.each_key { |word| @vocabulary[word] = true }
400
407
  @training_data << { category: category, features: features }
401
408
  end
@@ -444,7 +451,7 @@ module Classifier
444
451
  documents.each_slice(batch_size) do |batch|
445
452
  synchronize do
446
453
  batch.each do |text|
447
- features = text.word_hash
454
+ features = text.word_hash(@min_word_length)
448
455
  features.each_key { |word| @vocabulary[word] = true }
449
456
  @training_data << { category: category, features: features }
450
457
  end
@@ -463,7 +470,7 @@ module Classifier
463
470
  category = category.to_s.prepare_category_name
464
471
  raise StandardError, "No such category: #{category}" unless @categories.include?(category)
465
472
 
466
- features = text.word_hash
473
+ features = text.word_hash(@min_word_length)
467
474
  synchronize do
468
475
  features.each_key { |word| @vocabulary[word] = true }
469
476
  @training_data << { category: category, features: features }
@@ -570,6 +577,7 @@ module Classifier
570
577
  @fitted = data.fetch('fitted', true)
571
578
  @dirty = false
572
579
  @storage = nil
580
+ @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
573
581
  end
574
582
 
575
583
  def restore_weights_and_bias(data)
@@ -80,6 +80,7 @@ module Classifier
80
80
  # @rbs @u_matrix: Matrix?
81
81
  # @rbs @max_rank: Integer
82
82
  # @rbs @initial_vocab_size: Integer?
83
+ # @rbs @min_word_length: Integer
83
84
 
84
85
  attr_reader :word_list, :singular_values
85
86
  attr_accessor :auto_rebuild, :storage
@@ -110,6 +111,7 @@ module Classifier
110
111
  @max_rank = options[:max_rank] || DEFAULT_MAX_RANK
111
112
  @u_matrix = nil
112
113
  @initial_vocab_size = nil
114
+ @min_word_length = options[:min_word_length] || Classifier.config.min_word_length
113
115
  end
114
116
 
115
117
  # Returns true if the index needs to be rebuilt. The index needs
@@ -216,7 +218,13 @@ module Classifier
216
218
  #
217
219
  # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
218
220
  def add_item(item, *categories, &block)
219
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
221
+ clean_word_hash =
222
+ if block
223
+ block.call(item).clean_word_hash(@min_word_length)
224
+ else
225
+ item.to_s.clean_word_hash(@min_word_length)
226
+ end
227
+
220
228
  node = nil
221
229
 
222
230
  synchronize do
@@ -480,14 +488,15 @@ module Classifier
480
488
  # Custom marshal serialization to exclude mutex state
481
489
  # @rbs () -> Array[untyped]
482
490
  def marshal_dump
483
- [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
491
+ [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @min_word_length]
484
492
  end
485
493
 
486
494
  # Custom marshal deserialization to recreate mutex
487
495
  # @rbs (Array[untyped]) -> void
488
496
  def marshal_load(data)
489
497
  mu_initialize
490
- @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
498
+ @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty,
499
+ @min_word_length = data
491
500
  @storage = nil
492
501
  end
493
502
 
@@ -28,6 +28,7 @@ module Classifier
28
28
  # @rbs @fitted: bool
29
29
  # @rbs @dirty: bool
30
30
  # @rbs @storage: Storage::Base?
31
+ # @rbs @min_word_length: Integer
31
32
 
32
33
  attr_reader :vocabulary, :idf, :num_documents
33
34
  attr_accessor :storage
@@ -36,10 +37,12 @@ module Classifier
36
37
  # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
37
38
  # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
38
39
  # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
40
+ # - min_word_length: minimum word length filter in tokenization
39
41
  #
40
42
  # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
41
- # ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
42
- def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
43
+ # ?ngram_range: Array[Integer], ?sublinear_tf: bool, ?min_word_length: Integer) -> void
44
+ def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false,
45
+ min_word_length: Classifier.config.min_word_length)
43
46
  validate_df!(min_df, 'min_df')
44
47
  validate_df!(max_df, 'max_df')
45
48
  validate_ngram_range!(ngram_range)
@@ -54,6 +57,7 @@ module Classifier
54
57
  @fitted = false
55
58
  @dirty = false
56
59
  @storage = nil
60
+ @min_word_length = min_word_length
57
61
  end
58
62
 
59
63
  # Learns vocabulary and IDF weights from the corpus.
@@ -204,7 +208,8 @@ module Classifier
204
208
  vocabulary: @vocabulary,
205
209
  idf: @idf,
206
210
  num_documents: @num_documents,
207
- fitted: @fitted
211
+ fitted: @fitted,
212
+ min_word_length: @min_word_length
208
213
  }
209
214
  end
210
215
 
@@ -223,7 +228,8 @@ module Classifier
223
228
  min_df: data['min_df'],
224
229
  max_df: data['max_df'],
225
230
  ngram_range: data['ngram_range'],
226
- sublinear_tf: data['sublinear_tf']
231
+ sublinear_tf: data['sublinear_tf'],
232
+ min_word_length: data['min_word_length'] || Classifier.config.min_word_length
227
233
  )
228
234
 
229
235
  instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
@@ -238,12 +244,14 @@ module Classifier
238
244
 
239
245
  # @rbs () -> Array[untyped]
240
246
  def marshal_dump
241
- [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
247
+ [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
248
+ @min_word_length]
242
249
  end
243
250
 
244
251
  # @rbs (Array[untyped]) -> void
245
252
  def marshal_load(data)
246
- @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
253
+ @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
254
+ @min_word_length = data
247
255
  @dirty = false
248
256
  @storage = nil
249
257
  end
@@ -334,7 +342,7 @@ module Classifier
334
342
  result = Hash.new(0)
335
343
 
336
344
  if @ngram_range[0] <= 1
337
- word_hash = document.clean_word_hash
345
+ word_hash = document.clean_word_hash(@min_word_length)
338
346
  word_hash.each { |term, count| result[term] += count }
339
347
  end
340
348
 
@@ -1,3 +1,3 @@
1
1
  module Classifier
2
- VERSION = '2.3.2'.freeze
2
+ VERSION = '2.4.0'.freeze
3
3
  end
data/lib/classifier.rb CHANGED
@@ -36,3 +36,4 @@ require 'classifier/lsi'
36
36
  require 'classifier/knn'
37
37
  require 'classifier/tfidf'
38
38
  require 'classifier/logistic_regression'
39
+ require 'classifier/config'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.2
4
+ version: 2.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
@@ -162,6 +162,7 @@ files:
162
162
  - lib/classifier.rb
163
163
  - lib/classifier/bayes.rb
164
164
  - lib/classifier/cli.rb
165
+ - lib/classifier/config.rb
165
166
  - lib/classifier/errors.rb
166
167
  - lib/classifier/extensions/string.rb
167
168
  - lib/classifier/extensions/vector.rb
@@ -213,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
213
214
  - !ruby/object:Gem::Version
214
215
  version: '0'
215
216
  requirements: []
216
- rubygems_version: 4.0.3
217
+ rubygems_version: 4.0.10
217
218
  specification_version: 4
218
219
  summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
219
220
  vectorization.