classifier 2.3.2 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef8c75a0dfe0e6da3e2a7cf49d970c738913515b2f29f8759b8c2bc0df5ac8d0
4
- data.tar.gz: 03b67f752c656af13f87edfd32fe56fe201e1a0521978b06ed6139a5d01569a9
3
+ metadata.gz: 539130382c7ce45072e515ee9817f47fb1144af18498a81c27c7dda842477141
4
+ data.tar.gz: 1537f2c7c164ec70c14c7b314148fa81cfb947a93dfe328125082567c5a99a7e
5
5
  SHA512:
6
- metadata.gz: 42ac44b3698b864e27a47a79c468971e4c33f73e465a0c552715a93422e1cbbe009c57275378a330dd00607d4d3048590c805373e85f5bf31ecfaca4504ed311
7
- data.tar.gz: 41fe7ff430019ff41a97a110784419049914f3a86cdcc383349c58770ab5419cbcb6bb99ae930d08058e1003d4efc8026616317b2096352abe473d54203e99c5
6
+ metadata.gz: 3bf1385063c020bb08097192417232a6dc3fce7a16a057bec27ccf62253d69809fedc692f597b7ab6c4e36e8236629188e37add798d883ed670a55cb02c37023
7
+ data.tar.gz: '0528f07ae03ab30b752c3242b747058956831900c55c1c5a858ab623525075806a52fc8b66236de0f4b959ce678f9a24204887434b19c8662860d06173b2634f'
data/README.md CHANGED
@@ -27,7 +27,7 @@ gem 'classifier'
27
27
  Or install via Homebrew for CLI-only usage:
28
28
 
29
29
  ```bash
30
- brew install classifier
30
+ brew install cardmagic/tap/classifier
31
31
  ```
32
32
 
33
33
  ## Command Line
@@ -36,15 +36,15 @@ Classify text instantly with pre-trained models—no coding required:
36
36
 
37
37
  ```bash
38
38
  # Detect spam
39
- classifier classify "You won a free iPhone!" -r sms-spam-filter
39
+ classifier -r sms-spam-filter "You won a free iPhone"
40
40
  # => spam
41
41
 
42
42
  # Analyze sentiment
43
- classifier classify "This movie was absolutely amazing!" -r imdb-sentiment
43
+ classifier -r imdb-sentiment "This movie was absolutely amazing"
44
44
  # => positive
45
45
 
46
46
  # Detect emotions
47
- classifier classify "I'm so happy today!" -r emotion-detection
47
+ classifier -r emotion-detection "I am so happy today"
48
48
  # => joy
49
49
 
50
50
  # List all available models
@@ -59,12 +59,28 @@ classifier train positive reviews/good/*.txt
59
59
  classifier train negative reviews/bad/*.txt
60
60
 
61
61
  # Classify new text
62
- classifier classify "Great product, highly recommend"
62
+ classifier "Great product, highly recommend"
63
63
  # => positive
64
64
  ```
65
65
 
66
66
  [CLI Guide →](https://rubyclassifier.com/docs/guides/cli/basics)
67
67
 
68
+ ### Claude Code Plugin
69
+
70
+ Install as a plugin to get skills (auto-invoked) and slash commands:
71
+
72
+ ```bash
73
+ # Add the marketplace
74
+ claude plugin marketplace add cardmagic/ai-marketplace
75
+
76
+ # Install the plugin
77
+ claude plugin install classifier@cardmagic
78
+ ```
79
+
80
+ This gives you:
81
+ - **Skill**: Claude automatically classifies text when you ask about spam, sentiment, or emotions
82
+ - **Slash commands**: `/classifier:classify`, `/classifier:train`, `/classifier:models`
83
+
68
84
  ## Quick Start
69
85
 
70
86
  ### Bayesian
@@ -20,6 +20,7 @@ module Classifier
20
20
  # @rbs @cached_vocab_size: Integer?
21
21
  # @rbs @dirty: bool
22
22
  # @rbs @storage: Storage::Base?
23
+ # @rbs @min_word_length: Integer
23
24
 
24
25
  attr_accessor :storage
25
26
 
@@ -27,8 +28,9 @@ module Classifier
27
28
  # initialized and given a training method. E.g.,
28
29
  # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
29
30
  # b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
30
- # @rbs (*String | Symbol | Array[String | Symbol]) -> void
31
- def initialize(*categories)
31
+ # b = Classifier::Bayes.new 'Spam', min_word_length: 1
32
+ # @rbs (*String | Symbol | Array[String | Symbol], ?min_word_length: Integer) -> void
33
+ def initialize(*categories, min_word_length: Classifier.config.min_word_length)
32
34
  super()
33
35
  @categories = {}
34
36
  categories.flatten.each { |category| @categories[category.prepare_category_name] = {} }
@@ -39,6 +41,7 @@ module Classifier
39
41
  @cached_vocab_size = nil
40
42
  @dirty = false
41
43
  @storage = nil
44
+ @min_word_length = min_word_length
42
45
  end
43
46
 
44
47
  # Trains the classifier with text for a category.
@@ -76,7 +79,7 @@ module Classifier
76
79
  #
77
80
  # @rbs (String) -> Hash[String, Float]
78
81
  def classifications(text)
79
- words = text.word_hash.keys
82
+ words = text.word_hash(@min_word_length).keys
80
83
  synchronize do
81
84
  training_count = cached_training_count
82
85
  vocab_size = cached_vocab_size
@@ -117,7 +120,8 @@ module Classifier
117
120
  categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
118
121
  total_words: @total_words,
119
122
  category_counts: @category_counts.transform_keys(&:to_s),
120
- category_word_count: @category_word_count.transform_keys(&:to_s)
123
+ category_word_count: @category_word_count.transform_keys(&:to_s),
124
+ min_word_length: @min_word_length
121
125
  }
122
126
  end
123
127
 
@@ -324,20 +328,14 @@ module Classifier
324
328
  # puts "#{progress.completed} documents processed"
325
329
  # end
326
330
  #
327
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
328
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
329
- category = category.prepare_category_name
330
- raise StandardError, "No such category: #{category}" unless @categories.key?(category)
331
-
332
- reader = Streaming::LineReader.new(io, batch_size: batch_size)
333
- total = reader.estimate_line_count
334
- progress = Streaming::Progress.new(total: total)
335
-
336
- reader.each_batch do |batch|
337
- train_batch_internal(category, batch)
338
- progress.completed += batch.size
339
- progress.current_batch += 1
340
- yield progress if block_given?
331
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
332
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
333
+ raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
334
+ raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
335
+
336
+ pairs = category && io ? { category => io } : categories
337
+ pairs.each do |cat, stream|
338
+ stream_train_category(cat, stream, batch_size: batch_size, &)
341
339
  end
342
340
  end
343
341
 
@@ -385,6 +383,25 @@ module Classifier
385
383
 
386
384
  private
387
385
 
386
+ # Trains from an IO stream with a single category.
387
+ # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
388
+ def stream_train_category(category, io, batch_size:)
389
+ category = category.prepare_category_name
390
+ raise ArgumentError, "No such category: #{category}" unless @categories.key?(category)
391
+ raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
392
+
393
+ reader = Streaming::LineReader.new(io, batch_size: batch_size)
394
+ total = reader.estimate_line_count
395
+ progress = Streaming::Progress.new(total: total)
396
+
397
+ reader.each_batch do |batch|
398
+ train_batch_internal(category, batch)
399
+ progress.completed += batch.size
400
+ progress.current_batch += 1
401
+ yield progress if block_given?
402
+ end
403
+ end
404
+
388
405
  # Trains a batch of documents for a single category.
389
406
  # @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
390
407
  def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
@@ -409,7 +426,7 @@ module Classifier
409
426
  invalidate_caches
410
427
  @dirty = true
411
428
  batch.each do |text|
412
- word_hash = text.word_hash
429
+ word_hash = text.word_hash(@min_word_length)
413
430
  @category_counts[category] += 1
414
431
  word_hash.each do |word, count|
415
432
  @categories[category][word] ||= 0
@@ -425,7 +442,7 @@ module Classifier
425
442
  # @rbs (String | Symbol, String) -> void
426
443
  def train_single(category, text)
427
444
  category = category.prepare_category_name
428
- word_hash = text.word_hash
445
+ word_hash = text.word_hash(@min_word_length)
429
446
  synchronize do
430
447
  invalidate_caches
431
448
  @dirty = true
@@ -443,7 +460,7 @@ module Classifier
443
460
  # @rbs (String | Symbol, String) -> void
444
461
  def untrain_single(category, text)
445
462
  category = category.prepare_category_name
446
- word_hash = text.word_hash
463
+ word_hash = text.word_hash(@min_word_length)
447
464
  synchronize do
448
465
  invalidate_caches
449
466
  @dirty = true
@@ -487,6 +504,7 @@ module Classifier
487
504
  @cached_vocab_size = nil
488
505
  @dirty = false
489
506
  @storage = nil
507
+ @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
490
508
 
491
509
  data['categories'].each do |cat_name, words|
492
510
  @categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
@@ -0,0 +1,31 @@
1
+ # rbs_inline: enabled
2
+
3
+ module Classifier
4
+ # @rbs @config: Config?
5
+
6
+ # This lazy initialization is not thread-safe.
7
+ # In multi-threaded environments, ensure this method is called
8
+ # or configuration is set explicitly during startup before using classifiers.
9
+ # @rbs () -> Config
10
+ def config
11
+ @config ||= Config.new
12
+ end
13
+
14
+ # @rbs () { (Config) -> void } -> void
15
+ def configure(&block)
16
+ block&.call(config)
17
+ end
18
+
19
+ module_function :config, :configure
20
+
21
+ class Config
22
+ # @rbs @min_word_length: Integer
23
+
24
+ attr_accessor :min_word_length #: Integer
25
+
26
+ # @rbs () -> void
27
+ def initialize
28
+ @min_word_length = 3
29
+ end
30
+ end
31
+ end
@@ -20,27 +20,27 @@ class String
20
20
 
21
21
  # Return a Hash of strings => ints. Each word in the string is stemmed,
22
22
  # interned, and indexes to its frequency in the document.
23
- # @rbs () -> Hash[Symbol, Integer]
24
- def word_hash
25
- word_hash = clean_word_hash
23
+ # @rbs (?Integer) -> Hash[Symbol, Integer]
24
+ def word_hash(min_word_length = 3)
25
+ word_hash = clean_word_hash(min_word_length)
26
26
  symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
27
27
  word_hash.merge(symbol_hash)
28
28
  end
29
29
 
30
30
  # Return a word hash without extra punctuation or short symbols, just stemmed words
31
- # @rbs () -> Hash[Symbol, Integer]
32
- def clean_word_hash
33
- word_hash_for_words gsub(/[^\w\s]/, '').split
31
+ # @rbs (?Integer) -> Hash[Symbol, Integer]
32
+ def clean_word_hash(min_word_length = 3)
33
+ word_hash_for_words(gsub(/[^\w\s]/, '').split, min_word_length)
34
34
  end
35
35
 
36
36
  private
37
37
 
38
- # @rbs (Array[String]) -> Hash[Symbol, Integer]
39
- def word_hash_for_words(words)
38
+ # @rbs (Array[String], Integer) -> Hash[Symbol, Integer]
39
+ def word_hash_for_words(words, min_word_length)
40
40
  d = Hash.new(0)
41
41
  words.each do |word|
42
42
  word.downcase!
43
- d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
43
+ d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length >= min_word_length
44
44
  end
45
45
  d
46
46
  end
@@ -268,9 +268,10 @@ module Classifier
268
268
  # puts "#{progress.completed} documents processed"
269
269
  # end
270
270
  #
271
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
272
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &block)
273
- @lsi.train_from_stream(category, io, batch_size: batch_size, &block)
271
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
272
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
273
+ # @type var categories: untyped
274
+ @lsi.train_from_stream(category, io, batch_size: batch_size, **categories, &)
274
275
  synchronize { @dirty = true }
275
276
  end
276
277
 
@@ -34,6 +34,7 @@ module Classifier
34
34
  # @rbs @fitted: bool
35
35
  # @rbs @dirty: bool
36
36
  # @rbs @storage: Storage::Base?
37
+ # @rbs @min_word_length: Integer
37
38
 
38
39
  attr_accessor :storage
39
40
 
@@ -53,13 +54,16 @@ module Classifier
53
54
  # - regularization: L2 regularization strength (default: 0.01)
54
55
  # - max_iterations: Maximum training iterations (default: 100)
55
56
  # - tolerance: Convergence threshold (default: 1e-4)
57
+ # - min_word_length: Minimum length a word must have to be kept during tokenization (default: Classifier.config.min_word_length)
56
58
  #
57
59
  # @rbs (*String | Symbol | Array[String | Symbol], ?learning_rate: Float, ?regularization: Float,
58
- # ?max_iterations: Integer, ?tolerance: Float) -> void
60
+ # ?max_iterations: Integer, ?tolerance: Float, ?min_word_length: Integer) -> void
61
+ # rubocop:disable Metrics/ParameterLists
59
62
  def initialize(*categories, learning_rate: DEFAULT_LEARNING_RATE,
60
63
  regularization: DEFAULT_REGULARIZATION,
61
64
  max_iterations: DEFAULT_MAX_ITERATIONS,
62
- tolerance: DEFAULT_TOLERANCE)
65
+ tolerance: DEFAULT_TOLERANCE,
66
+ min_word_length: Classifier.config.min_word_length)
63
67
  super()
64
68
  categories = categories.flatten
65
69
  @categories = categories.map { |c| c.to_s.prepare_category_name }
@@ -74,7 +78,9 @@ module Classifier
74
78
  @fitted = false
75
79
  @dirty = false
76
80
  @storage = nil
81
+ @min_word_length = min_word_length
77
82
  end
83
+ # rubocop:enable Metrics/ParameterLists
78
84
 
79
85
  # Trains the classifier with text for a category.
80
86
  #
@@ -130,7 +136,7 @@ module Classifier
130
136
  def probabilities(text)
131
137
  raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
132
138
 
133
- features = text.word_hash
139
+ features = text.word_hash(@min_word_length)
134
140
  synchronize do
135
141
  softmax(compute_scores(features))
136
142
  end
@@ -143,7 +149,7 @@ module Classifier
143
149
  def classifications(text)
144
150
  raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
145
151
 
146
- features = text.word_hash
152
+ features = text.word_hash(@min_word_length)
147
153
  synchronize do
148
154
  compute_scores(features).transform_keys(&:to_s)
149
155
  end
@@ -239,7 +245,8 @@ module Classifier
239
245
  regularization: @regularization,
240
246
  max_iterations: @max_iterations,
241
247
  tolerance: @tolerance,
242
- fitted: @fitted
248
+ fitted: @fitted,
249
+ min_word_length: @min_word_length
243
250
  }
244
251
  end
245
252
 
@@ -336,7 +343,7 @@ module Classifier
336
343
  def marshal_dump
337
344
  fit unless @fitted
338
345
  [@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
339
- @max_iterations, @tolerance, @fitted]
346
+ @max_iterations, @tolerance, @fitted, @min_word_length]
340
347
  end
341
348
 
342
349
  # Custom marshal deserialization to recreate mutex.
@@ -345,7 +352,7 @@ module Classifier
345
352
  def marshal_load(data)
346
353
  mu_initialize
347
354
  @categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
348
- @max_iterations, @tolerance, @fitted = data
355
+ @max_iterations, @tolerance, @fitted, @min_word_length = data
349
356
  @training_data = []
350
357
  @dirty = false
351
358
  @storage = nil
@@ -383,28 +390,14 @@ module Classifier
383
390
  # end
384
391
  # classifier.fit
385
392
  #
386
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
387
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
388
- category = category.to_s.prepare_category_name
389
- raise StandardError, "No such category: #{category}" unless @categories.include?(category)
390
-
391
- reader = Streaming::LineReader.new(io, batch_size: batch_size)
392
- total = reader.estimate_line_count
393
- progress = Streaming::Progress.new(total: total)
393
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
394
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
395
+ raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
396
+ raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
394
397
 
395
- reader.each_batch do |batch|
396
- synchronize do
397
- batch.each do |text|
398
- features = text.word_hash
399
- features.each_key { |word| @vocabulary[word] = true }
400
- @training_data << { category: category, features: features }
401
- end
402
- @fitted = false
403
- @dirty = true
404
- end
405
- progress.completed += batch.size
406
- progress.current_batch += 1
407
- yield progress if block_given?
398
+ pairs = category && io ? { category => io } : categories
399
+ pairs.each do |cat, stream|
400
+ stream_train_category(cat, stream, batch_size:, &)
408
401
  end
409
402
  end
410
403
 
@@ -433,6 +426,33 @@ module Classifier
433
426
 
434
427
  private
435
428
 
429
+ # Trains from an IO stream with a single category.
430
+ # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
431
+ def stream_train_category(category, io, batch_size:)
432
+ category = category.to_s.prepare_category_name
433
+ raise ArgumentError, "No such category: #{category}" unless @categories.include?(category)
434
+ raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
435
+
436
+ reader = Streaming::LineReader.new(io, batch_size: batch_size)
437
+ total = reader.estimate_line_count
438
+ progress = Streaming::Progress.new(total: total)
439
+
440
+ reader.each_batch do |batch|
441
+ synchronize do
442
+ batch.each do |text|
443
+ features = text.word_hash(@min_word_length)
444
+ features.each_key { |word| @vocabulary[word] = true }
445
+ @training_data << { category: category, features: features }
446
+ end
447
+ @fitted = false
448
+ @dirty = true
449
+ end
450
+ progress.completed += batch.size
451
+ progress.current_batch += 1
452
+ yield progress if block_given?
453
+ end
454
+ end
455
+
436
456
  # Trains a batch of documents for a single category.
437
457
  # @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
438
458
  def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
@@ -444,7 +464,7 @@ module Classifier
444
464
  documents.each_slice(batch_size) do |batch|
445
465
  synchronize do
446
466
  batch.each do |text|
447
- features = text.word_hash
467
+ features = text.word_hash(@min_word_length)
448
468
  features.each_key { |word| @vocabulary[word] = true }
449
469
  @training_data << { category: category, features: features }
450
470
  end
@@ -463,7 +483,7 @@ module Classifier
463
483
  category = category.to_s.prepare_category_name
464
484
  raise StandardError, "No such category: #{category}" unless @categories.include?(category)
465
485
 
466
- features = text.word_hash
486
+ features = text.word_hash(@min_word_length)
467
487
  synchronize do
468
488
  features.each_key { |word| @vocabulary[word] = true }
469
489
  @training_data << { category: category, features: features }
@@ -570,6 +590,7 @@ module Classifier
570
590
  @fitted = data.fetch('fitted', true)
571
591
  @dirty = false
572
592
  @storage = nil
593
+ @min_word_length = data['min_word_length'] || Classifier.config.min_word_length
573
594
  end
574
595
 
575
596
  def restore_weights_and_bias(data)
@@ -80,6 +80,7 @@ module Classifier
80
80
  # @rbs @u_matrix: Matrix?
81
81
  # @rbs @max_rank: Integer
82
82
  # @rbs @initial_vocab_size: Integer?
83
+ # @rbs @min_word_length: Integer
83
84
 
84
85
  attr_reader :word_list, :singular_values
85
86
  attr_accessor :auto_rebuild, :storage
@@ -110,6 +111,7 @@ module Classifier
110
111
  @max_rank = options[:max_rank] || DEFAULT_MAX_RANK
111
112
  @u_matrix = nil
112
113
  @initial_vocab_size = nil
114
+ @min_word_length = options[:min_word_length] || Classifier.config.min_word_length
113
115
  end
114
116
 
115
117
  # Returns true if the index needs to be rebuilt. The index needs
@@ -216,7 +218,13 @@ module Classifier
216
218
  #
217
219
  # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
218
220
  def add_item(item, *categories, &block)
219
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
221
+ clean_word_hash =
222
+ if block
223
+ block.call(item).clean_word_hash(@min_word_length)
224
+ else
225
+ item.to_s.clean_word_hash(@min_word_length)
226
+ end
227
+
220
228
  node = nil
221
229
 
222
230
  synchronize do
@@ -480,14 +488,15 @@ module Classifier
480
488
  # Custom marshal serialization to exclude mutex state
481
489
  # @rbs () -> Array[untyped]
482
490
  def marshal_dump
483
- [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
491
+ [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @min_word_length]
484
492
  end
485
493
 
486
494
  # Custom marshal deserialization to recreate mutex
487
495
  # @rbs (Array[untyped]) -> void
488
496
  def marshal_load(data)
489
497
  mu_initialize
490
- @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
498
+ @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty,
499
+ @min_word_length = data
491
500
  @storage = nil
492
501
  end
493
502
 
@@ -653,21 +662,22 @@ module Classifier
653
662
  # puts "#{progress.completed} documents processed"
654
663
  # end
655
664
  #
656
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
657
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
658
- original_auto_rebuild = @auto_rebuild
659
- @auto_rebuild = false
660
-
665
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
666
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
667
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
668
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
669
+ raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
670
+ raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
671
+
672
+ pairs = category && io ? { category => io } : categories
673
+ pairs.each_value do |io|
674
+ raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
675
+ end
661
676
  begin
662
- reader = Streaming::LineReader.new(io, batch_size: batch_size)
663
- total = reader.estimate_line_count
664
- progress = Streaming::Progress.new(total: total)
665
-
666
- reader.each_batch do |batch|
667
- batch.each { |text| add_item(text, category) }
668
- progress.completed += batch.size
669
- progress.current_batch += 1
670
- yield progress if block_given?
677
+ original_auto_rebuild = @auto_rebuild
678
+ @auto_rebuild = false
679
+ pairs.each do |cat, stream|
680
+ stream_train_category(cat, stream, batch_size:, &)
671
681
  end
672
682
  ensure
673
683
  @auto_rebuild = original_auto_rebuild
@@ -720,6 +730,21 @@ module Classifier
720
730
 
721
731
  private
722
732
 
733
+ # Trains from an IO stream with a single category.
734
+ # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
735
+ def stream_train_category(category, io, batch_size:)
736
+ reader = Streaming::LineReader.new(io, batch_size: batch_size)
737
+ total = reader.estimate_line_count
738
+ progress = Streaming::Progress.new(total: total)
739
+
740
+ reader.each_batch do |batch|
741
+ batch.each { |text| add_item(text, category) }
742
+ progress.completed += batch.size
743
+ progress.current_batch += 1
744
+ yield progress if block_given?
745
+ end
746
+ end
747
+
723
748
  # Restores LSI state from a JSON string (used by reload)
724
749
  # @rbs (String) -> void
725
750
  def restore_from_json(json)
@@ -26,8 +26,8 @@ module Classifier
26
26
  # Trains the classifier from an IO stream.
27
27
  # Each line in the stream is treated as a separate document.
28
28
  #
29
- # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
30
- def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
29
+ # @rbs (?(Symbol | String | nil), ?IO?, ?batch_size: Integer, **IO) { (Progress) -> void } -> void
30
+ def train_from_stream(category = nil, io = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
31
31
  raise NotImplementedError, "#{self.class} must implement train_from_stream"
32
32
  end
33
33
 
@@ -28,6 +28,7 @@ module Classifier
28
28
  # @rbs @fitted: bool
29
29
  # @rbs @dirty: bool
30
30
  # @rbs @storage: Storage::Base?
31
+ # @rbs @min_word_length: Integer
31
32
 
32
33
  attr_reader :vocabulary, :idf, :num_documents
33
34
  attr_accessor :storage
@@ -36,10 +37,12 @@ module Classifier
36
37
  # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
37
38
  # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
38
39
  # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
40
+ # - min_word_length: minimum length a word must have to be kept during tokenization (default: Classifier.config.min_word_length)
39
41
  #
40
42
  # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
41
- # ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
42
- def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
43
+ # ?ngram_range: Array[Integer], ?sublinear_tf: bool, ?min_word_length: Integer) -> void
44
+ def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false,
45
+ min_word_length: Classifier.config.min_word_length)
43
46
  validate_df!(min_df, 'min_df')
44
47
  validate_df!(max_df, 'max_df')
45
48
  validate_ngram_range!(ngram_range)
@@ -54,6 +57,7 @@ module Classifier
54
57
  @fitted = false
55
58
  @dirty = false
56
59
  @storage = nil
60
+ @min_word_length = min_word_length
57
61
  end
58
62
 
59
63
  # Learns vocabulary and IDF weights from the corpus.
@@ -204,7 +208,8 @@ module Classifier
204
208
  vocabulary: @vocabulary,
205
209
  idf: @idf,
206
210
  num_documents: @num_documents,
207
- fitted: @fitted
211
+ fitted: @fitted,
212
+ min_word_length: @min_word_length
208
213
  }
209
214
  end
210
215
 
@@ -223,7 +228,8 @@ module Classifier
223
228
  min_df: data['min_df'],
224
229
  max_df: data['max_df'],
225
230
  ngram_range: data['ngram_range'],
226
- sublinear_tf: data['sublinear_tf']
231
+ sublinear_tf: data['sublinear_tf'],
232
+ min_word_length: data['min_word_length'] || Classifier.config.min_word_length
227
233
  )
228
234
 
229
235
  instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
@@ -238,12 +244,14 @@ module Classifier
238
244
 
239
245
  # @rbs () -> Array[untyped]
240
246
  def marshal_dump
241
- [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
247
+ [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
248
+ @min_word_length]
242
249
  end
243
250
 
244
251
  # @rbs (Array[untyped]) -> void
245
252
  def marshal_load(data)
246
- @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
253
+ @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
254
+ @min_word_length = data
247
255
  @dirty = false
248
256
  @storage = nil
249
257
  end
@@ -334,7 +342,7 @@ module Classifier
334
342
  result = Hash.new(0)
335
343
 
336
344
  if @ngram_range[0] <= 1
337
- word_hash = document.clean_word_hash
345
+ word_hash = document.clean_word_hash(@min_word_length)
338
346
  word_hash.each { |term, count| result[term] += count }
339
347
  end
340
348
 
@@ -1,3 +1,3 @@
1
1
  module Classifier
2
- VERSION = '2.3.2'.freeze
2
+ VERSION = '2.5.0'.freeze
3
3
  end
data/lib/classifier.rb CHANGED
@@ -36,3 +36,4 @@ require 'classifier/lsi'
36
36
  require 'classifier/knn'
37
37
  require 'classifier/tfidf'
38
38
  require 'classifier/logistic_regression'
39
+ require 'classifier/config'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.2
4
+ version: 2.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
@@ -162,6 +162,7 @@ files:
162
162
  - lib/classifier.rb
163
163
  - lib/classifier/bayes.rb
164
164
  - lib/classifier/cli.rb
165
+ - lib/classifier/config.rb
165
166
  - lib/classifier/errors.rb
166
167
  - lib/classifier/extensions/string.rb
167
168
  - lib/classifier/extensions/vector.rb
@@ -213,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
213
214
  - !ruby/object:Gem::Version
214
215
  version: '0'
215
216
  requirements: []
216
- rubygems_version: 4.0.3
217
+ rubygems_version: 4.0.10
217
218
  specification_version: 4
218
219
  summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
219
220
  vectorization.