classifier 2.4.0 → 2.5.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/classifier/bayes.rb +27 -14
- data/lib/classifier/knn.rb +4 -3
- data/lib/classifier/logistic_regression.rb +35 -22
- data/lib/classifier/lsi.rb +30 -14
- data/lib/classifier/streaming.rb +2 -2
- data/lib/classifier/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 539130382c7ce45072e515ee9817f47fb1144af18498a81c27c7dda842477141
|
|
4
|
+
data.tar.gz: 1537f2c7c164ec70c14c7b314148fa81cfb947a93dfe328125082567c5a99a7e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3bf1385063c020bb08097192417232a6dc3fce7a16a057bec27ccf62253d69809fedc692f597b7ab6c4e36e8236629188e37add798d883ed670a55cb02c37023
|
|
7
|
+
data.tar.gz: '0528f07ae03ab30b752c3242b747058956831900c55c1c5a858ab623525075806a52fc8b66236de0f4b959ce678f9a24204887434b19c8662860d06173b2634f'
|
data/lib/classifier/bayes.rb
CHANGED
|
@@ -328,20 +328,14 @@ module Classifier
|
|
|
328
328
|
# puts "#{progress.completed} documents processed"
|
|
329
329
|
# end
|
|
330
330
|
#
|
|
331
|
-
# @rbs (String | Symbol, IO
|
|
332
|
-
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
|
333
|
-
category
|
|
334
|
-
raise
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
reader.each_batch do |batch|
|
|
341
|
-
train_batch_internal(category, batch)
|
|
342
|
-
progress.completed += batch.size
|
|
343
|
-
progress.current_batch += 1
|
|
344
|
-
yield progress if block_given?
|
|
331
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
332
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
333
|
+
raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
|
|
334
|
+
raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
|
|
335
|
+
|
|
336
|
+
pairs = category && io ? { category => io } : categories
|
|
337
|
+
pairs.each do |cat, stream|
|
|
338
|
+
stream_train_category(cat, stream, batch_size: batch_size, &)
|
|
345
339
|
end
|
|
346
340
|
end
|
|
347
341
|
|
|
@@ -389,6 +383,25 @@ module Classifier
|
|
|
389
383
|
|
|
390
384
|
private
|
|
391
385
|
|
|
386
|
+
# Trains from an IO stream with a single category.
|
|
387
|
+
# @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
388
|
+
def stream_train_category(category, io, batch_size:)
|
|
389
|
+
category = category.prepare_category_name
|
|
390
|
+
raise ArgumentError, "No such category: #{category}" unless @categories.key?(category)
|
|
391
|
+
raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
|
|
392
|
+
|
|
393
|
+
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
394
|
+
total = reader.estimate_line_count
|
|
395
|
+
progress = Streaming::Progress.new(total: total)
|
|
396
|
+
|
|
397
|
+
reader.each_batch do |batch|
|
|
398
|
+
train_batch_internal(category, batch)
|
|
399
|
+
progress.completed += batch.size
|
|
400
|
+
progress.current_batch += 1
|
|
401
|
+
yield progress if block_given?
|
|
402
|
+
end
|
|
403
|
+
end
|
|
404
|
+
|
|
392
405
|
# Trains a batch of documents for a single category.
|
|
393
406
|
# @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
394
407
|
def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
data/lib/classifier/knn.rb
CHANGED
|
@@ -268,9 +268,10 @@ module Classifier
|
|
|
268
268
|
# puts "#{progress.completed} documents processed"
|
|
269
269
|
# end
|
|
270
270
|
#
|
|
271
|
-
# @rbs (String | Symbol, IO
|
|
272
|
-
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &
|
|
273
|
-
@
|
|
271
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
272
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
273
|
+
# @type var categories: untyped
|
|
274
|
+
@lsi.train_from_stream(category, io, batch_size: batch_size, **categories, &)
|
|
274
275
|
synchronize { @dirty = true }
|
|
275
276
|
end
|
|
276
277
|
|
|
@@ -390,28 +390,14 @@ module Classifier
|
|
|
390
390
|
# end
|
|
391
391
|
# classifier.fit
|
|
392
392
|
#
|
|
393
|
-
# @rbs (String | Symbol, IO
|
|
394
|
-
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
|
395
|
-
category
|
|
396
|
-
raise
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
reader.each_batch do |batch|
|
|
403
|
-
synchronize do
|
|
404
|
-
batch.each do |text|
|
|
405
|
-
features = text.word_hash(@min_word_length)
|
|
406
|
-
features.each_key { |word| @vocabulary[word] = true }
|
|
407
|
-
@training_data << { category: category, features: features }
|
|
408
|
-
end
|
|
409
|
-
@fitted = false
|
|
410
|
-
@dirty = true
|
|
411
|
-
end
|
|
412
|
-
progress.completed += batch.size
|
|
413
|
-
progress.current_batch += 1
|
|
414
|
-
yield progress if block_given?
|
|
393
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
394
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
395
|
+
raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
|
|
396
|
+
raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
|
|
397
|
+
|
|
398
|
+
pairs = category && io ? { category => io } : categories
|
|
399
|
+
pairs.each do |cat, stream|
|
|
400
|
+
stream_train_category(cat, stream, batch_size:, &)
|
|
415
401
|
end
|
|
416
402
|
end
|
|
417
403
|
|
|
@@ -440,6 +426,33 @@ module Classifier
|
|
|
440
426
|
|
|
441
427
|
private
|
|
442
428
|
|
|
429
|
+
# Trains from an IO stream with a single category.
|
|
430
|
+
# @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
431
|
+
def stream_train_category(category, io, batch_size:)
|
|
432
|
+
category = category.to_s.prepare_category_name
|
|
433
|
+
raise ArgumentError, "No such category: #{category}" unless @categories.include?(category)
|
|
434
|
+
raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
|
|
435
|
+
|
|
436
|
+
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
437
|
+
total = reader.estimate_line_count
|
|
438
|
+
progress = Streaming::Progress.new(total: total)
|
|
439
|
+
|
|
440
|
+
reader.each_batch do |batch|
|
|
441
|
+
synchronize do
|
|
442
|
+
batch.each do |text|
|
|
443
|
+
features = text.word_hash(@min_word_length)
|
|
444
|
+
features.each_key { |word| @vocabulary[word] = true }
|
|
445
|
+
@training_data << { category: category, features: features }
|
|
446
|
+
end
|
|
447
|
+
@fitted = false
|
|
448
|
+
@dirty = true
|
|
449
|
+
end
|
|
450
|
+
progress.completed += batch.size
|
|
451
|
+
progress.current_batch += 1
|
|
452
|
+
yield progress if block_given?
|
|
453
|
+
end
|
|
454
|
+
end
|
|
455
|
+
|
|
443
456
|
# Trains a batch of documents for a single category.
|
|
444
457
|
# @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
445
458
|
def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
data/lib/classifier/lsi.rb
CHANGED
|
@@ -662,21 +662,22 @@ module Classifier
|
|
|
662
662
|
# puts "#{progress.completed} documents processed"
|
|
663
663
|
# end
|
|
664
664
|
#
|
|
665
|
-
#
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
665
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
666
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
667
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
668
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
669
|
+
raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
|
|
670
|
+
raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
|
|
671
|
+
|
|
672
|
+
pairs = category && io ? { category => io } : categories
|
|
673
|
+
pairs.each_value do |io|
|
|
674
|
+
raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
|
|
675
|
+
end
|
|
670
676
|
begin
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
reader.each_batch do |batch|
|
|
676
|
-
batch.each { |text| add_item(text, category) }
|
|
677
|
-
progress.completed += batch.size
|
|
678
|
-
progress.current_batch += 1
|
|
679
|
-
yield progress if block_given?
|
|
677
|
+
original_auto_rebuild = @auto_rebuild
|
|
678
|
+
@auto_rebuild = false
|
|
679
|
+
pairs.each do |cat, stream|
|
|
680
|
+
stream_train_category(cat, stream, batch_size:, &)
|
|
680
681
|
end
|
|
681
682
|
ensure
|
|
682
683
|
@auto_rebuild = original_auto_rebuild
|
|
@@ -729,6 +730,21 @@ module Classifier
|
|
|
729
730
|
|
|
730
731
|
private
|
|
731
732
|
|
|
733
|
+
# Trains from an IO stream with a single category.
|
|
734
|
+
# @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
735
|
+
def stream_train_category(category, io, batch_size:)
|
|
736
|
+
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
737
|
+
total = reader.estimate_line_count
|
|
738
|
+
progress = Streaming::Progress.new(total: total)
|
|
739
|
+
|
|
740
|
+
reader.each_batch do |batch|
|
|
741
|
+
batch.each { |text| add_item(text, category) }
|
|
742
|
+
progress.completed += batch.size
|
|
743
|
+
progress.current_batch += 1
|
|
744
|
+
yield progress if block_given?
|
|
745
|
+
end
|
|
746
|
+
end
|
|
747
|
+
|
|
732
748
|
# Restores LSI state from a JSON string (used by reload)
|
|
733
749
|
# @rbs (String) -> void
|
|
734
750
|
def restore_from_json(json)
|
data/lib/classifier/streaming.rb
CHANGED
|
@@ -26,8 +26,8 @@ module Classifier
|
|
|
26
26
|
# Trains the classifier from an IO stream.
|
|
27
27
|
# Each line in the stream is treated as a separate document.
|
|
28
28
|
#
|
|
29
|
-
# @rbs (Symbol | String, IO
|
|
30
|
-
def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
|
|
29
|
+
# @rbs (?(Symbol | String | nil), ?IO?, ?batch_size: Integer, **IO) { (Progress) -> void } -> void
|
|
30
|
+
def train_from_stream(category = nil, io = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
|
|
31
31
|
raise NotImplementedError, "#{self.class} must implement train_from_stream"
|
|
32
32
|
end
|
|
33
33
|
|
data/lib/classifier/version.rb
CHANGED