classifier 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +66 -199
- data/ext/classifier/classifier_ext.c +1 -0
- data/ext/classifier/incremental_svd.c +393 -0
- data/ext/classifier/linalg.h +8 -0
- data/lib/classifier/bayes.rb +177 -53
- data/lib/classifier/errors.rb +3 -0
- data/lib/classifier/knn.rb +351 -0
- data/lib/classifier/logistic_regression.rb +571 -0
- data/lib/classifier/lsi/incremental_svd.rb +166 -0
- data/lib/classifier/lsi/summary.rb +25 -5
- data/lib/classifier/lsi.rb +365 -17
- data/lib/classifier/streaming/line_reader.rb +99 -0
- data/lib/classifier/streaming/progress.rb +96 -0
- data/lib/classifier/streaming.rb +122 -0
- data/lib/classifier/tfidf.rb +408 -0
- data/lib/classifier.rb +4 -0
- data/sig/vendor/matrix.rbs +25 -14
- data/sig/vendor/streaming.rbs +14 -0
- metadata +17 -4
data/lib/classifier/lsi.rb
CHANGED
@@ -58,6 +58,7 @@ require 'mutex_m'
 require 'classifier/lsi/word_list'
 require 'classifier/lsi/content_node'
 require 'classifier/lsi/summary'
+require 'classifier/lsi/incremental_svd'

 module Classifier
   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
@@ -65,6 +66,7 @@ module Classifier
   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
   class LSI
     include Mutex_m
+    include Streaming

     # @rbs @auto_rebuild: bool
     # @rbs @word_list: WordList
@@ -74,14 +76,24 @@ module Classifier
     # @rbs @singular_values: Array[Float]?
     # @rbs @dirty: bool
     # @rbs @storage: Storage::Base?
+    # @rbs @incremental_mode: bool
+    # @rbs @u_matrix: Matrix?
+    # @rbs @max_rank: Integer
+    # @rbs @initial_vocab_size: Integer?

     attr_reader :word_list, :singular_values
     attr_accessor :auto_rebuild, :storage

+    # Default maximum rank for incremental SVD
+    DEFAULT_MAX_RANK = 100
+
     # Create a fresh index.
     # If you want to call #build_index manually, use
     #   Classifier::LSI.new auto_rebuild: false
     #
+    # For incremental SVD mode (adds documents without full rebuild):
+    #   Classifier::LSI.new incremental: true, max_rank: 100
+    #
     # @rbs (?Hash[Symbol, untyped]) -> void
     def initialize(options = {})
       super()
@@ -92,6 +104,12 @@ module Classifier
       @built_at_version = -1
       @dirty = false
       @storage = nil
+
+      # Incremental SVD settings
+      @incremental_mode = options[:incremental] == true
+      @max_rank = options[:max_rank] || DEFAULT_MAX_RANK
+      @u_matrix = nil
+      @initial_vocab_size = nil
     end

     # Returns true if the index needs to be rebuilt. The index needs
@@ -122,12 +140,73 @@ module Classifier
       end
     end

+    # Returns true if incremental mode is enabled and active.
+    # Incremental mode becomes active after the first build_index call.
+    #
+    # @rbs () -> bool
+    def incremental_enabled?
+      @incremental_mode && !@u_matrix.nil?
+    end
+
+    # Returns the current rank of the incremental SVD (number of singular values kept).
+    # Returns nil if incremental mode is not active.
+    #
+    # @rbs () -> Integer?
+    def current_rank
+      @singular_values&.count(&:positive?)
+    end
+
+    # Disables incremental mode. Subsequent adds will trigger full rebuilds.
+    #
+    # @rbs () -> void
+    def disable_incremental_mode!
+      @incremental_mode = false
+      @u_matrix = nil
+      @initial_vocab_size = nil
+    end
+
+    # Enables incremental mode with optional max_rank setting.
+    # The next build_index call will store the U matrix for incremental updates.
+    #
+    # @rbs (?max_rank: Integer) -> void
+    def enable_incremental_mode!(max_rank: DEFAULT_MAX_RANK)
+      @incremental_mode = true
+      @max_rank = max_rank
+    end
+
+    # Adds items to the index using hash-style syntax.
+    # The hash keys are categories, and values are items (or arrays of items).
+    #
+    # For example:
+    #   lsi = Classifier::LSI.new
+    #   lsi.add("Dog" => "Dogs are loyal pets")
+    #   lsi.add("Cat" => "Cats are independent")
+    #   lsi.add(Bird: "Birds can fly")  # Symbol keys work too
+    #
+    # Multiple items with the same category:
+    #   lsi.add("Dog" => ["Dogs are loyal", "Puppies are cute"])
+    #
+    # Batch operations with multiple categories:
+    #   lsi.add(
+    #     "Dog" => ["Dogs are loyal", "Puppies are cute"],
+    #     "Cat" => ["Cats are independent", "Kittens are playful"]
+    #   )
+    #
+    # @rbs (**untyped items) -> void
+    def add(**items)
+      items.each do |category, value|
+        Array(value).each { |doc| add_item(doc, category.to_s) }
+      end
+    end
+
     # Adds an item to the index. item is assumed to be a string, but
     # any item may be indexed so long as it responds to #to_s or if
     # you provide an optional block explaining how the indexer can
     # fetch fresh string data. This optional block is passed the item,
     # so the item may only be a reference to a URL or file name.
     #
+    # @deprecated Use {#add} instead for clearer hash-style syntax.
+    #
     # For example:
     #   lsi = Classifier::LSI.new
     #   lsi.add_item "This is just plain text"
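Taken together, the constructor options from the earlier hunks and the mode helpers and hash-style add above suggest the following usage. This is a minimal sketch rather than an example shipped with the gem, and it assumes the gem is installed and loaded with require 'classifier':

    require 'classifier'

    # Build the first index explicitly; the U matrix stored by that build is
    # what makes the incremental path available for later adds.
    lsi = Classifier::LSI.new(incremental: true, max_rank: 50, auto_rebuild: false)
    lsi.add('Dog' => ['Dogs are loyal pets', 'Puppies are cute'],
            'Cat' => 'Cats are independent')
    lsi.build_index

    lsi.incremental_enabled?   # => true, now that a U matrix exists
    lsi.current_rank           # => number of singular values currently kept

    # This add goes through the incremental SVD update instead of a full rebuild.
    lsi.add('Bird' => 'Birds can fly')

    # Fall back to full rebuilds at any point.
    lsi.disable_incremental_mode!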
@@ -138,11 +217,18 @@ module Classifier
     # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
     def add_item(item, *categories, &block)
       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+      node = nil
+
       synchronize do
-        @items[item] = ContentNode.new(clean_word_hash, *categories)
+        node = ContentNode.new(clean_word_hash, *categories)
+        @items[item] = node
         @version += 1
         @dirty = true
       end
+
+      # Use incremental update if enabled and we have a U matrix
+      return perform_incremental_update(node, clean_word_hash) if @incremental_mode && @u_matrix
+
       build_index if @auto_rebuild
     end

@@ -203,12 +289,12 @@ module Classifier
     # A value of 1 for cutoff means that no semantic analysis will take place,
     # turning the LSI class into a simple vector search engine.
     #
-    # @rbs (?Float) -> void
-    def build_index(cutoff = 0.75)
+    # @rbs (?Float, ?force: bool) -> void
+    def build_index(cutoff = 0.75, force: false)
       validate_cutoff!(cutoff)

       synchronize do
-        return unless needs_rebuild_unlocked?
+        return unless force || needs_rebuild_unlocked?

         make_word_list

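The force: keyword added here lets the decomposition be re-run even when no items changed, for example to try a different cutoff. A minimal sketch (assumes the gem is installed; the documents are placeholders):

    require 'classifier'

    lsi = Classifier::LSI.new(auto_rebuild: false)
    lsi.add('Dog' => 'Dogs are loyal pets', 'Cat' => 'Cats are independent')

    # cutoff is roughly the fraction of singular values kept: 0.75 keeps the
    # largest ~75%, while 1.0 keeps them all (plain vector search, no LSA).
    lsi.build_index(0.75)

    # Rebuild with a different cutoff even though the item set is unchanged;
    # without force: true the early return would skip the work.
    lsi.build_index(1.0, force: true)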
@@ -219,14 +305,20 @@ module Classifier
           # Convert vectors to arrays for matrix construction
           tda_arrays = tda.map { |v| v.respond_to?(:to_a) ? v.to_a : v }
           tdm = self.class.matrix_class.alloc(*tda_arrays).trans
-          ntdm = build_reduced_matrix(tdm, cutoff)
+          ntdm, u_mat = build_reduced_matrix_with_u(tdm, cutoff)
           assign_native_ext_lsi_vectors(ntdm, doc_list)
         else
           tdm = Matrix.rows(tda).trans
-          ntdm = build_reduced_matrix(tdm, cutoff)
+          ntdm, u_mat = build_reduced_matrix_with_u(tdm, cutoff)
           assign_ruby_lsi_vectors(ntdm, doc_list)
         end

+        # Store U matrix for incremental mode
+        if @incremental_mode
+          @u_matrix = u_mat
+          @initial_vocab_size = @word_list.size
+        end
+
         @built_at_version = @version
       end
     end
@@ -532,6 +624,100 @@ module Classifier
       from_json(File.read(path))
     end

+    # Loads an LSI index from a checkpoint.
+    #
+    # @rbs (storage: Storage::Base, checkpoint_id: String) -> LSI
+    def self.load_checkpoint(storage:, checkpoint_id:)
+      raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)
+
+      dir = File.dirname(storage.path)
+      base = File.basename(storage.path, '.*')
+      ext = File.extname(storage.path)
+      checkpoint_path = File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
+
+      checkpoint_storage = Storage::File.new(path: checkpoint_path)
+      instance = load(storage: checkpoint_storage)
+      instance.storage = storage
+      instance
+    end
+
+    # Trains the LSI index from an IO stream.
+    # Each line in the stream is treated as a separate document.
+    # Documents are added without rebuilding, then the index is rebuilt at the end.
+    #
+    # @example Train from a file
+    #   lsi.train_from_stream(:category, File.open('corpus.txt'))
+    #
+    # @example With progress tracking
+    #   lsi.train_from_stream(:category, io, batch_size: 500) do |progress|
+    #     puts "#{progress.completed} documents processed"
+    #   end
+    #
+    # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
+    def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
+      original_auto_rebuild = @auto_rebuild
+      @auto_rebuild = false
+
+      begin
+        reader = Streaming::LineReader.new(io, batch_size: batch_size)
+        total = reader.estimate_line_count
+        progress = Streaming::Progress.new(total: total)
+
+        reader.each_batch do |batch|
+          batch.each { |text| add_item(text, category) }
+          progress.completed += batch.size
+          progress.current_batch += 1
+          yield progress if block_given?
+        end
+      ensure
+        @auto_rebuild = original_auto_rebuild
+        build_index if original_auto_rebuild
+      end
+    end
+
+    # Adds items to the index in batches from an array.
+    # Documents are added without rebuilding, then the index is rebuilt at the end.
+    #
+    # @example Batch add with progress
+    #   lsi.add_batch(Dog: documents, batch_size: 100) do |progress|
+    #     puts "#{progress.percent}% complete"
+    #   end
+    #
+    # @rbs (?batch_size: Integer, **Array[String]) { (Streaming::Progress) -> void } -> void
+    def add_batch(batch_size: Streaming::DEFAULT_BATCH_SIZE, **items)
+      original_auto_rebuild = @auto_rebuild
+      @auto_rebuild = false
+
+      begin
+        total_docs = items.values.sum { |v| Array(v).size }
+        progress = Streaming::Progress.new(total: total_docs)
+
+        items.each do |category, documents|
+          Array(documents).each_slice(batch_size) do |batch|
+            batch.each { |doc| add_item(doc, category.to_s) }
+            progress.completed += batch.size
+            progress.current_batch += 1
+            yield progress if block_given?
+          end
+        end
+      ensure
+        @auto_rebuild = original_auto_rebuild
+        build_index if original_auto_rebuild
+      end
+    end
+
+    # Alias train_batch to add_batch for API consistency with other classifiers.
+    # Note: LSI uses categories differently (items have categories, not the training call).
+    #
+    # @rbs (?(String | Symbol)?, ?Array[String]?, ?batch_size: Integer, **Array[String]) { (Streaming::Progress) -> void } -> void
+    def train_batch(category = nil, documents = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &block)
+      if category && documents
+        add_batch(batch_size: batch_size, **{ category.to_sym => documents }, &block)
+      else
+        add_batch(batch_size: batch_size, **categories, &block)
+      end
+    end
+
     private

     # Restores LSI state from a JSON string (used by reload)
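load_checkpoint derives the checkpoint file name from the main storage path, so the relationship between the two files can be read straight off the code above. A sketch with hypothetical paths (it assumes a checkpoint file with that name was written earlier):

    require 'classifier'

    storage = Classifier::Storage::File.new(path: 'tmp/lsi_index.json')

    # Looks for "<base>_checkpoint_<id><ext>" next to the main file, i.e.
    # 'tmp/lsi_index_checkpoint_2024-06-01.json' here, loads it, and points
    # the returned instance back at the main storage.
    lsi = Classifier::LSI.load_checkpoint(storage: storage,
                                          checkpoint_id: '2024-06-01')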
@@ -602,6 +788,7 @@ module Classifier
     # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
     def proximity_array_for_content_unlocked(doc, &)
       return [] if needs_rebuild_unlocked?
+      return @items.keys.map { |item| [item, 1.0] } if @items.size == 1

       content_node = node_for_content_unlocked(doc, &)
       result =
@@ -651,7 +838,7 @@ module Classifier
       votes
     end

-    # Unlocked version of node_for_content for internal use
+    # Unlocked version of node_for_content for internal use.
     # @rbs (String) ?{ (String) -> String } -> ContentNode
     def node_for_content_unlocked(item, &block)
       return @items[item] if @items[item]
@@ -659,31 +846,68 @@ module Classifier
       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
       cn = ContentNode.new(clean_word_hash, &block)
       cn.raw_vector_with(@word_list) unless needs_rebuild_unlocked?
+      assign_lsi_vector_incremental(cn) if incremental_enabled?
       cn
     end

     # @rbs (untyped, ?Float) -> untyped
     def build_reduced_matrix(matrix, cutoff = 0.75)
+      result, _u = build_reduced_matrix_with_u(matrix, cutoff)
+      result
+    end

+    # Builds reduced matrix and returns both the result and the U matrix.
+    # U matrix is needed for incremental SVD updates.
+    # @rbs (untyped, ?Float) -> [untyped, Matrix]
+    def build_reduced_matrix_with_u(matrix, cutoff = 0.75)
+      u, v, s = matrix.SV_decomp

+      all_singular_values = s.sort.reverse
       s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
+      s_cutoff = all_singular_values[s_cutoff_index]
+
+      kept_indices = []
+      kept_singular_values = []
       s.size.times do |ord|
+        if s[ord] >= s_cutoff
+          kept_indices << ord
+          kept_singular_values << s[ord]
+        else
+          s[ord] = 0.0
+        end
       end
-      # Reconstruct the term document matrix, only with reduced rank
-      result = u * self.class.matrix_class.diag(s) * v.trans

+      @singular_values = kept_singular_values.sort.reverse
+      result = u * self.class.matrix_class.diag(s) * v.trans
       result = result.trans if result.row_size != matrix.row_size
+      u_reduced = extract_reduced_u(u, kept_indices, s)

-      result
+      [result, u_reduced]
     end

+    # Extracts columns from U corresponding to kept singular values.
+    # Columns are sorted by descending singular value to match @singular_values order.
+    # rubocop:disable Naming/MethodParameterName
+    # @rbs (untyped, Array[Integer], Array[Float]) -> Matrix
+    def extract_reduced_u(u, kept_indices, singular_values)
+      return Matrix.empty(u.row_size, 0) if kept_indices.empty?
+
+      sorted_indices = kept_indices.sort_by { |i| -singular_values[i] }
+
+      if u.respond_to?(:to_ruby_matrix)
+        u = u.to_ruby_matrix
+      elsif !u.is_a?(::Matrix)
+        rows = u.row_size.times.map do |i|
+          sorted_indices.map { |j| u[i, j] }
+        end
+        return Matrix.rows(rows)
+      end
+
+      cols = sorted_indices.map { |i| u.column(i).to_a }
+      Matrix.columns(cols)
+    end
+    # rubocop:enable Naming/MethodParameterName
+
     # @rbs () -> void
     def make_word_list
       @word_list = WordList.new
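The cutoff arithmetic in build_reduced_matrix_with_u can be checked in isolation; the following standalone sketch mirrors the threshold selection using only plain Ruby (the singular values are made-up numbers):

    # Same index/threshold computation as build_reduced_matrix_with_u.
    s = [9.0, 0.5, 4.0, 2.0]                # singular values from the SVD (invented)
    cutoff = 0.75

    all_singular_values = s.sort.reverse                     # => [9.0, 4.0, 2.0, 0.5]
    s_cutoff_index = [(s.size * cutoff).round - 1, 0].max    # => 2
    s_cutoff = all_singular_values[s_cutoff_index]           # => 2.0

    kept = s.select { |v| v >= s_cutoff }                    # => [9.0, 4.0, 2.0]
    # The remaining value (0.5) is zeroed before the matrix is reconstructed,
    # and the kept values, sorted descending, become @singular_values.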
@@ -691,5 +915,129 @@ module Classifier
         node.word_hash.each_key { |key| @word_list.add_word key }
       end
     end
+
+    # Performs incremental SVD update for a new document.
+    # @rbs (ContentNode, Hash[Symbol, Integer]) -> void
+    def perform_incremental_update(node, word_hash)
+      needs_full_rebuild = false
+      old_rank = nil
+
+      synchronize do
+        if vocabulary_growth_exceeds_threshold?(word_hash)
+          disable_incremental_mode!
+          needs_full_rebuild = true
+          next
+        end
+
+        old_rank = @u_matrix.column_size
+        extend_vocabulary_for_incremental(word_hash)
+        raw_vec = node.raw_vector_with(@word_list)
+        raw_vector = Vector[*raw_vec.to_a]
+
+        @u_matrix, @singular_values = IncrementalSVD.update(
+          @u_matrix, @singular_values, raw_vector, max_rank: @max_rank
+        )
+
+        new_rank = @u_matrix.column_size
+        if new_rank > old_rank
+          reproject_all_documents
+        else
+          assign_lsi_vector_incremental(node)
+        end
+
+        @built_at_version = @version
+      end
+
+      build_index if needs_full_rebuild
+    end
+
+    # Checks if vocabulary growth would exceed threshold (20%)
+    # @rbs (Hash[Symbol, Integer]) -> bool
+    def vocabulary_growth_exceeds_threshold?(word_hash)
+      return false unless @initial_vocab_size&.positive?
+
+      new_words = word_hash.keys.count { |w| @word_list[w].nil? }
+      growth_ratio = new_words.to_f / @initial_vocab_size
+      growth_ratio > 0.2
+    end
+
+    # Extends vocabulary and U matrix for new words.
+    # @rbs (Hash[Symbol, Integer]) -> void
+    def extend_vocabulary_for_incremental(word_hash)
+      new_words = word_hash.keys.select { |w| @word_list[w].nil? }
+      return if new_words.empty?
+
+      new_words.each { |word| @word_list.add_word(word) }
+      extend_u_matrix(new_words.size)
+    end
+
+    # Extends U matrix with zero rows for new vocabulary terms.
+    # @rbs (Integer) -> void
+    def extend_u_matrix(num_new_rows)
+      return if num_new_rows.zero? || @u_matrix.nil?
+
+      if self.class.native_available? && @u_matrix.is_a?(self.class.matrix_class)
+        new_rows = self.class.matrix_class.zeros(num_new_rows, @u_matrix.column_size)
+        @u_matrix = self.class.matrix_class.vstack(@u_matrix, new_rows)
+      else
+        new_rows = Matrix.zero(num_new_rows, @u_matrix.column_size)
+        @u_matrix = Matrix.vstack(@u_matrix, new_rows)
+      end
+    end
+
+    # Re-projects all documents onto the current U matrix
+    # Called when rank grows to ensure consistent LSI vector sizes
+    # Uses native batch_project for performance when available
+    # @rbs () -> void
+    def reproject_all_documents
+      return unless @u_matrix
+      return reproject_all_documents_native if self.class.native_available? && @u_matrix.respond_to?(:batch_project)
+
+      reproject_all_documents_ruby
+    end
+
+    # Native batch re-projection using C extension.
+    # @rbs () -> void
+    def reproject_all_documents_native
+      nodes = @items.values
+      raw_vectors = nodes.map do |node|
+        raw = node.raw_vector_with(@word_list)
+        raw.is_a?(self.class.vector_class) ? raw : self.class.vector_class.alloc(raw.to_a)
+      end
+
+      lsi_vectors = @u_matrix.batch_project(raw_vectors)
+
+      nodes.each_with_index do |node, i|
+        lsi_vec = lsi_vectors[i].row
+        node.lsi_vector = lsi_vec
+        node.lsi_norm = lsi_vec.normalize
+      end
+    end
+
+    # Pure Ruby re-projection (fallback)
+    # @rbs () -> void
+    def reproject_all_documents_ruby
+      @items.each_value do |node|
+        assign_lsi_vector_incremental(node)
+      end
+    end
+
+    # Assigns LSI vector to a node using projection: lsi_vec = U^T * raw_vec.
+    # @rbs (ContentNode) -> void
+    def assign_lsi_vector_incremental(node)
+      return unless @u_matrix
+
+      raw_vec = node.raw_vector_with(@word_list)
+      raw_vector = Vector[*raw_vec.to_a]
+      lsi_arr = (@u_matrix.transpose * raw_vector).to_a
+
+      lsi_vec = if self.class.native_available?
+                  self.class.vector_class.alloc(lsi_arr).row
+                else
+                  Vector[*lsi_arr]
+                end
+      node.lsi_vector = lsi_vec
+      node.lsi_norm = lsi_vec.normalize
+    end
   end
 end
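The projection in assign_lsi_vector_incremental is simply lsi_vec = U^T * raw_vec, and extend_u_matrix only pads U with zero rows when the vocabulary grows. A self-contained sketch with Ruby's stdlib matrix library and made-up numbers (three terms, rank two) shows the shape bookkeeping:

    require 'matrix'

    # A tiny rank-2 U matrix over a three-term vocabulary (invented values).
    u_matrix = Matrix[[0.8, 0.1],
                      [0.5, 0.2],
                      [0.0, 0.9]]

    raw_vec = Vector[1.0, 2.0, 0.0]        # raw term counts for one document

    lsi_vec = u_matrix.transpose * raw_vec
    # => Vector[1.8, 0.5]  -- one coordinate per kept singular value

    # When new vocabulary appears, U is padded with zero rows so its row count
    # matches the grown word list before the next projection (cf. extend_u_matrix).
    u_matrix = Matrix.vstack(u_matrix, Matrix.zero(1, u_matrix.column_size))
    u_matrix.row_size   # => 4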
data/lib/classifier/streaming/line_reader.rb
ADDED
@@ -0,0 +1,99 @@
+# rbs_inline: enabled
+
+module Classifier
+  module Streaming
+    # Memory-efficient line reader for large files and IO streams.
+    # Reads lines one at a time and can yield in configurable batches.
+    #
+    # @example Reading line by line
+    #   reader = LineReader.new(File.open('large_corpus.txt'))
+    #   reader.each { |line| process(line) }
+    #
+    # @example Reading in batches
+    #   reader = LineReader.new(io, batch_size: 100)
+    #   reader.each_batch { |batch| process_batch(batch) }
+    class LineReader
+      include Enumerable #[String]
+
+      # @rbs @io: IO
+      # @rbs @batch_size: Integer
+
+      attr_reader :batch_size
+
+      # Creates a new LineReader.
+      #
+      # @rbs (IO, ?batch_size: Integer) -> void
+      def initialize(io, batch_size: 100)
+        @io = io
+        @batch_size = batch_size
+      end
+
+      # Iterates over each line in the IO stream.
+      # Lines are chomped (trailing newlines removed).
+      #
+      # @rbs () { (String) -> void } -> void
+      # @rbs () -> Enumerator[String, void]
+      def each
+        return enum_for(:each) unless block_given?
+
+        @io.each_line do |line|
+          yield line.chomp
+        end
+      end
+
+      # Iterates over batches of lines.
+      # Each batch is an array of chomped lines.
+      #
+      # @rbs () { (Array[String]) -> void } -> void
+      # @rbs () -> Enumerator[Array[String], void]
+      def each_batch
+        return enum_for(:each_batch) unless block_given?
+
+        batch = [] #: Array[String]
+        each do |line|
+          batch << line
+          if batch.size >= @batch_size
+            yield batch
+            batch = []
+          end
+        end
+        yield batch unless batch.empty?
+      end
+
+      # Estimates the total number of lines in the IO stream.
+      # This is a rough estimate based on file size and average line length.
+      # Returns nil for non-seekable streams.
+      #
+      # @rbs (?sample_size: Integer) -> Integer?
+      def estimate_line_count(sample_size: 100)
+        return nil unless @io.respond_to?(:size) && @io.respond_to?(:rewind)
+
+        begin
+          original_pos = @io.pos
+          @io.rewind
+
+          sample_bytes = 0
+          sample_lines = 0
+
+          sample_size.times do
+            line = @io.gets
+            break unless line
+
+            sample_bytes += line.bytesize
+            sample_lines += 1
+          end
+
+          @io.seek(original_pos)
+
+          return nil if sample_lines.zero?
+
+          avg_line_size = sample_bytes.to_f / sample_lines
+          io_size = @io.__send__(:size) #: Integer
+          (io_size / avg_line_size).round
+        rescue IOError, Errno::ESPIPE
+          nil
+        end
+      end
+    end
+  end
+end
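A quick way to see the batching behaviour is to feed LineReader a StringIO. A small sketch (the sample lines are invented, and it assumes require 'classifier' loads the streaming files, as the added requires in data/lib/classifier.rb suggest):

    require 'classifier'
    require 'stringio'

    io = StringIO.new("first doc\nsecond doc\nthird doc\n")
    reader = Classifier::Streaming::LineReader.new(io, batch_size: 2)

    reader.each_batch { |batch| p batch }
    # ["first doc", "second doc"]
    # ["third doc"]

    # StringIO is seekable, so estimate_line_count samples it and scales by
    # total size; for this input the estimate works out to 3.
    io.rewind
    Classifier::Streaming::LineReader.new(io).estimate_line_count   # => 3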
data/lib/classifier/streaming/progress.rb
ADDED
@@ -0,0 +1,96 @@
+# rbs_inline: enabled
+
+module Classifier
+  module Streaming
+    # Progress tracking object yielded to blocks during batch/stream operations.
+    # Provides information about training progress including completion percentage,
+    # elapsed time, processing rate, and estimated time remaining.
+    #
+    # @example Basic usage with train_batch
+    #   classifier.train_batch(:spam, documents, batch_size: 100) do |progress|
+    #     puts "#{progress.completed}/#{progress.total} (#{progress.percent}%)"
+    #     puts "Rate: #{progress.rate.round(1)} docs/sec"
+    #     puts "ETA: #{progress.eta&.round}s" if progress.eta
+    #   end
+    class Progress
+      # @rbs @completed: Integer
+      # @rbs @total: Integer?
+      # @rbs @start_time: Time
+      # @rbs @current_batch: Integer
+
+      attr_reader :start_time, :total
+      attr_accessor :completed, :current_batch
+
+      # @rbs (?total: Integer?, ?completed: Integer) -> void
+      def initialize(total: nil, completed: 0)
+        @completed = completed
+        @total = total
+        @start_time = Time.now
+        @current_batch = 0
+      end
+
+      # Returns the completion percentage (0-100).
+      # Returns nil if total is unknown.
+      #
+      # @rbs () -> Float?
+      def percent
+        return nil unless @total&.positive?
+
+        (@completed.to_f / @total * 100).round(2)
+      end
+
+      # Returns the elapsed time in seconds since the operation started.
+      #
+      # @rbs () -> Float
+      def elapsed
+        Time.now - @start_time
+      end
+
+      # Returns the processing rate in items per second.
+      # Returns 0 if no time has elapsed.
+      #
+      # @rbs () -> Float
+      def rate
+        e = elapsed
+        return 0.0 if e.zero?
+
+        @completed / e
+      end
+
+      # Returns the estimated time remaining in seconds.
+      # Returns nil if total is unknown or rate is zero.
+      #
+      # @rbs () -> Float?
+      def eta
+        return nil unless @total
+        return nil if rate.zero?
+        return 0.0 if @completed >= @total
+
+        (@total - @completed) / rate
+      end
+
+      # Returns true if the operation is complete.
+      #
+      # @rbs () -> bool
+      def complete?
+        return false unless @total
+
+        @completed >= @total
+      end
+
+      # Returns a hash representation of the progress state.
+      #
+      # @rbs () -> Hash[Symbol, untyped]
+      def to_h
+        {
+          completed: @completed,
+          total: @total,
+          percent: percent,
+          elapsed: elapsed.round(2),
+          rate: rate.round(2),
+          eta: eta&.round(2)
+        }
+      end
+    end
+  end
+end
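The Progress numbers follow directly from its accessors: percent = completed / total * 100, rate = completed / elapsed, and eta = (total - completed) / rate. A short sketch with invented counts (again assuming require 'classifier' loads the streaming files):

    require 'classifier'

    progress = Classifier::Streaming::Progress.new(total: 1_000)
    progress.completed = 250
    progress.current_batch = 5

    progress.percent     # => 25.0
    progress.complete?   # => false

    # rate and eta depend on wall-clock time: after 10 elapsed seconds the
    # rate would be 250 / 10 = 25.0 docs/sec and eta (1000 - 250) / 25.0 = 30.0 s.
    progress.to_h        # => { completed: 250, total: 1000, percent: 25.0, ... }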