classifier 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c30fac948021b0009e53c7c4a232ac3e2472707fd7fb476cbb3f36e4912af399
4
- data.tar.gz: 7120fa872d6ae6b49a8117c6dd672f2ee63d8b62384b348c7db8ca8cf790f88a
3
+ metadata.gz: 539130382c7ce45072e515ee9817f47fb1144af18498a81c27c7dda842477141
4
+ data.tar.gz: 1537f2c7c164ec70c14c7b314148fa81cfb947a93dfe328125082567c5a99a7e
5
5
  SHA512:
6
- metadata.gz: e9250266207fe481dfec4d09fc8d30a97e591649edcea4d0a975a5a59c379819341fa46b219ee3ecc218cbb1c156fd708bb288478b322dde81333000abecf43f
7
- data.tar.gz: 7dbf662acb9b5819dd77224690668654211de7abe2c0b009fc059696880479401bd0e2e503f002b6862da955bf794990b00784856751438898c999b2f9cc49f6
6
+ metadata.gz: 3bf1385063c020bb08097192417232a6dc3fce7a16a057bec27ccf62253d69809fedc692f597b7ab6c4e36e8236629188e37add798d883ed670a55cb02c37023
7
+ data.tar.gz: '0528f07ae03ab30b752c3242b747058956831900c55c1c5a858ab623525075806a52fc8b66236de0f4b959ce678f9a24204887434b19c8662860d06173b2634f'
@@ -328,20 +328,14 @@ module Classifier
328
328
  # puts "#{progress.completed} documents processed"
329
329
  # end
330
330
  #
331
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
332
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
333
- category = category.prepare_category_name
334
- raise StandardError, "No such category: #{category}" unless @categories.key?(category)
335
-
336
- reader = Streaming::LineReader.new(io, batch_size: batch_size)
337
- total = reader.estimate_line_count
338
- progress = Streaming::Progress.new(total: total)
339
-
340
- reader.each_batch do |batch|
341
- train_batch_internal(category, batch)
342
- progress.completed += batch.size
343
- progress.current_batch += 1
344
- yield progress if block_given?
331
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
332
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
333
+ raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
334
+ raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
335
+
336
+ pairs = category && io ? { category => io } : categories
337
+ pairs.each do |cat, stream|
338
+ stream_train_category(cat, stream, batch_size: batch_size, &)
345
339
  end
346
340
  end
347
341
 
@@ -389,6 +383,25 @@ module Classifier
389
383
 
390
384
  private
391
385
 
386
+ # Trains from an IO stream with a single category.
387
+ # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
388
+ def stream_train_category(category, io, batch_size:)
389
+ category = category.prepare_category_name
390
+ raise ArgumentError, "No such category: #{category}" unless @categories.key?(category)
391
+ raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
392
+
393
+ reader = Streaming::LineReader.new(io, batch_size: batch_size)
394
+ total = reader.estimate_line_count
395
+ progress = Streaming::Progress.new(total: total)
396
+
397
+ reader.each_batch do |batch|
398
+ train_batch_internal(category, batch)
399
+ progress.completed += batch.size
400
+ progress.current_batch += 1
401
+ yield progress if block_given?
402
+ end
403
+ end
404
+
392
405
  # Trains a batch of documents for a single category.
393
406
  # @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
394
407
  def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
@@ -268,9 +268,10 @@ module Classifier
268
268
  # puts "#{progress.completed} documents processed"
269
269
  # end
270
270
  #
271
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
272
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &block)
273
- @lsi.train_from_stream(category, io, batch_size: batch_size, &block)
271
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
272
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
273
+ # @type var categories: untyped
274
+ @lsi.train_from_stream(category, io, batch_size: batch_size, **categories, &)
274
275
  synchronize { @dirty = true }
275
276
  end
276
277
 
@@ -390,28 +390,14 @@ module Classifier
390
390
  # end
391
391
  # classifier.fit
392
392
  #
393
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
394
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
395
- category = category.to_s.prepare_category_name
396
- raise StandardError, "No such category: #{category}" unless @categories.include?(category)
397
-
398
- reader = Streaming::LineReader.new(io, batch_size: batch_size)
399
- total = reader.estimate_line_count
400
- progress = Streaming::Progress.new(total: total)
401
-
402
- reader.each_batch do |batch|
403
- synchronize do
404
- batch.each do |text|
405
- features = text.word_hash(@min_word_length)
406
- features.each_key { |word| @vocabulary[word] = true }
407
- @training_data << { category: category, features: features }
408
- end
409
- @fitted = false
410
- @dirty = true
411
- end
412
- progress.completed += batch.size
413
- progress.current_batch += 1
414
- yield progress if block_given?
393
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
394
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
395
+ raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
396
+ raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
397
+
398
+ pairs = category && io ? { category => io } : categories
399
+ pairs.each do |cat, stream|
400
+ stream_train_category(cat, stream, batch_size:, &)
415
401
  end
416
402
  end
417
403
 
@@ -440,6 +426,33 @@ module Classifier
440
426
 
441
427
  private
442
428
 
429
+ # Trains from an IO stream with a single category.
430
+ # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
431
+ def stream_train_category(category, io, batch_size:)
432
+ category = category.to_s.prepare_category_name
433
+ raise ArgumentError, "No such category: #{category}" unless @categories.include?(category)
434
+ raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
435
+
436
+ reader = Streaming::LineReader.new(io, batch_size: batch_size)
437
+ total = reader.estimate_line_count
438
+ progress = Streaming::Progress.new(total: total)
439
+
440
+ reader.each_batch do |batch|
441
+ synchronize do
442
+ batch.each do |text|
443
+ features = text.word_hash(@min_word_length)
444
+ features.each_key { |word| @vocabulary[word] = true }
445
+ @training_data << { category: category, features: features }
446
+ end
447
+ @fitted = false
448
+ @dirty = true
449
+ end
450
+ progress.completed += batch.size
451
+ progress.current_batch += 1
452
+ yield progress if block_given?
453
+ end
454
+ end
455
+
443
456
  # Trains a batch of documents for a single category.
444
457
  # @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
445
458
  def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
@@ -662,21 +662,22 @@ module Classifier
662
662
  # puts "#{progress.completed} documents processed"
663
663
  # end
664
664
  #
665
- # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
666
- def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
667
- original_auto_rebuild = @auto_rebuild
668
- @auto_rebuild = false
669
-
665
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
666
+ # @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
667
+ def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
668
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
669
+ raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
670
+ raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
671
+
672
+ pairs = category && io ? { category => io } : categories
673
+ pairs.each_value do |io|
674
+ raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
675
+ end
670
676
  begin
671
- reader = Streaming::LineReader.new(io, batch_size: batch_size)
672
- total = reader.estimate_line_count
673
- progress = Streaming::Progress.new(total: total)
674
-
675
- reader.each_batch do |batch|
676
- batch.each { |text| add_item(text, category) }
677
- progress.completed += batch.size
678
- progress.current_batch += 1
679
- yield progress if block_given?
677
+ original_auto_rebuild = @auto_rebuild
678
+ @auto_rebuild = false
679
+ pairs.each do |cat, stream|
680
+ stream_train_category(cat, stream, batch_size:, &)
680
681
  end
681
682
  ensure
682
683
  @auto_rebuild = original_auto_rebuild
@@ -729,6 +730,21 @@ module Classifier
729
730
 
730
731
  private
731
732
 
733
+ # Trains from an IO stream with a single category.
734
+ # @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
735
+ def stream_train_category(category, io, batch_size:)
736
+ reader = Streaming::LineReader.new(io, batch_size: batch_size)
737
+ total = reader.estimate_line_count
738
+ progress = Streaming::Progress.new(total: total)
739
+
740
+ reader.each_batch do |batch|
741
+ batch.each { |text| add_item(text, category) }
742
+ progress.completed += batch.size
743
+ progress.current_batch += 1
744
+ yield progress if block_given?
745
+ end
746
+ end
747
+
732
748
  # Restores LSI state from a JSON string (used by reload)
733
749
  # @rbs (String) -> void
734
750
  def restore_from_json(json)
@@ -26,8 +26,8 @@ module Classifier
26
26
  # Trains the classifier from an IO stream.
27
27
  # Each line in the stream is treated as a separate document.
28
28
  #
29
- # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
30
- def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
29
+ # @rbs (?(Symbol | String | nil), ?IO?, ?batch_size: Integer, **IO) { (Progress) -> void } -> void
30
+ def train_from_stream(category = nil, io = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
31
31
  raise NotImplementedError, "#{self.class} must implement train_from_stream"
32
32
  end
33
33
 
@@ -1,3 +1,3 @@
1
1
  module Classifier
2
- VERSION = '2.4.0'.freeze
2
+ VERSION = '2.5.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.4.0
4
+ version: 2.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson