classifier 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,6 +58,7 @@ require 'mutex_m'
58
58
  require 'classifier/lsi/word_list'
59
59
  require 'classifier/lsi/content_node'
60
60
  require 'classifier/lsi/summary'
61
+ require 'classifier/lsi/incremental_svd'
61
62
 
62
63
  module Classifier
63
64
  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
@@ -65,6 +66,7 @@ module Classifier
65
66
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
66
67
  class LSI
67
68
  include Mutex_m
69
+ include Streaming
68
70
 
69
71
  # @rbs @auto_rebuild: bool
70
72
  # @rbs @word_list: WordList
@@ -74,14 +76,24 @@ module Classifier
74
76
  # @rbs @singular_values: Array[Float]?
75
77
  # @rbs @dirty: bool
76
78
  # @rbs @storage: Storage::Base?
79
+ # @rbs @incremental_mode: bool
80
+ # @rbs @u_matrix: Matrix?
81
+ # @rbs @max_rank: Integer
82
+ # @rbs @initial_vocab_size: Integer?
77
83
 
78
84
  attr_reader :word_list, :singular_values
79
85
  attr_accessor :auto_rebuild, :storage
80
86
 
87
+ # Default maximum rank for incremental SVD
88
+ DEFAULT_MAX_RANK = 100
89
+
81
90
  # Create a fresh index.
82
91
  # If you want to call #build_index manually, use
83
92
  # Classifier::LSI.new auto_rebuild: false
84
93
  #
94
+ # For incremental SVD mode (adds documents without full rebuild):
95
+ # Classifier::LSI.new incremental: true, max_rank: 100
96
+ #
85
97
  # @rbs (?Hash[Symbol, untyped]) -> void
86
98
  def initialize(options = {})
87
99
  super()
@@ -92,6 +104,12 @@ module Classifier
92
104
  @built_at_version = -1
93
105
  @dirty = false
94
106
  @storage = nil
107
+
108
+ # Incremental SVD settings
109
+ @incremental_mode = options[:incremental] == true
110
+ @max_rank = options[:max_rank] || DEFAULT_MAX_RANK
111
+ @u_matrix = nil
112
+ @initial_vocab_size = nil
95
113
  end
96
114
 
97
115
  # Returns true if the index needs to be rebuilt. The index needs
@@ -122,12 +140,73 @@ module Classifier
122
140
  end
123
141
  end
124
142
 
143
+ # Returns true if incremental mode is enabled and active.
144
+ # Incremental mode becomes active after the first build_index call.
145
+ #
146
+ # @rbs () -> bool
147
def incremental_enabled?
  # Incremental updates need both the opt-in flag and a stored U matrix
  # (build_index stores the U matrix when the flag is set).
  # The ternary guarantees a real boolean even if the flag is nil,
  # matching the `-> bool` signature.
  @incremental_mode ? !@u_matrix.nil? : false
end
150
+
151
+ # Returns the current rank of the incremental SVD (number of singular values kept).
152
+ # Returns nil when no singular values have been computed yet.
153
+ #
154
+ # @rbs () -> Integer?
155
def current_rank
  # Nothing to count until singular values have been stored.
  return nil if @singular_values.nil?

  # Rank = number of strictly positive singular values.
  @singular_values.count { |value| value.positive? }
end
158
+
159
+ # Disables incremental mode. Subsequent adds will trigger full rebuilds.
160
+ #
161
+ # @rbs () -> void
162
def disable_incremental_mode!
  # Turn the flag off and drop the cached U matrix and vocabulary baseline.
  @incremental_mode = false
  @u_matrix = @initial_vocab_size = nil
end
167
+
168
+ # Enables incremental mode with optional max_rank setting.
169
+ # The next build_index call will store the U matrix for incremental updates.
170
+ #
171
+ # @rbs (?max_rank: Integer) -> void
172
def enable_incremental_mode!(max_rank: DEFAULT_MAX_RANK)
  # Fail fast on an unusable rank before touching any state; max_rank is
  # later handed to IncrementalSVD.update as the rank cap.
  unless max_rank.is_a?(Integer) && max_rank.positive?
    raise ArgumentError, "max_rank must be a positive Integer, got #{max_rank.inspect}"
  end

  # Only records the settings. The U matrix is stored by build_index, which
  # returns early on an up-to-date index unless called with force: true.
  @incremental_mode = true
  @max_rank = max_rank
end
176
+
177
+ # Adds items to the index using hash-style syntax.
178
+ # The hash keys are categories, and values are items (or arrays of items).
179
+ #
180
+ # For example:
181
+ # lsi = Classifier::LSI.new
182
+ # lsi.add("Dog" => "Dogs are loyal pets")
183
+ # lsi.add("Cat" => "Cats are independent")
184
+ # lsi.add(Bird: "Birds can fly") # Symbol keys work too
185
+ #
186
+ # Multiple items with the same category:
187
+ # lsi.add("Dog" => ["Dogs are loyal", "Puppies are cute"])
188
+ #
189
+ # Batch operations with multiple categories:
190
+ # lsi.add(
191
+ # "Dog" => ["Dogs are loyal", "Puppies are cute"],
192
+ # "Cat" => ["Cats are independent", "Kittens are playful"]
193
+ # )
194
+ #
195
+ # @rbs (**untyped items) -> void
196
def add(**items)
  # Each key is a category label; its value is one item or a list of items.
  items.each_pair do |category, value|
    Array(value).each do |doc|
      add_item(doc, category.to_s)
    end
  end
end
201
+
125
202
  # Adds an item to the index. item is assumed to be a string, but
126
203
  # any item may be indexed so long as it responds to #to_s or if
127
204
  # you provide an optional block explaining how the indexer can
128
205
  # fetch fresh string data. This optional block is passed the item,
129
206
  # so the item may only be a reference to a URL or file name.
130
207
  #
208
+ # @deprecated Use {#add} instead for clearer hash-style syntax.
209
+ #
131
210
  # For example:
132
211
  # lsi = Classifier::LSI.new
133
212
  # lsi.add_item "This is just plain text"
@@ -138,11 +217,18 @@ module Classifier
138
217
  # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
139
218
  def add_item(item, *categories, &block)
140
219
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
220
+ node = nil
221
+
141
222
  synchronize do
142
- @items[item] = ContentNode.new(clean_word_hash, *categories)
223
+ node = ContentNode.new(clean_word_hash, *categories)
224
+ @items[item] = node
143
225
  @version += 1
144
226
  @dirty = true
145
227
  end
228
+
229
+ # Use incremental update if enabled and we have a U matrix
230
+ return perform_incremental_update(node, clean_word_hash) if @incremental_mode && @u_matrix
231
+
146
232
  build_index if @auto_rebuild
147
233
  end
148
234
 
@@ -203,12 +289,12 @@ module Classifier
203
289
  # A value of 1 for cutoff means that no semantic analysis will take place,
204
290
  # turning the LSI class into a simple vector search engine.
205
291
  #
206
- # @rbs (?Float) -> void
207
- def build_index(cutoff = 0.75)
292
+ # @rbs (?Float, ?force: bool) -> void
293
+ def build_index(cutoff = 0.75, force: false)
208
294
  validate_cutoff!(cutoff)
209
295
 
210
296
  synchronize do
211
- return unless needs_rebuild_unlocked?
297
+ return unless force || needs_rebuild_unlocked?
212
298
 
213
299
  make_word_list
214
300
 
@@ -219,14 +305,20 @@ module Classifier
219
305
  # Convert vectors to arrays for matrix construction
220
306
  tda_arrays = tda.map { |v| v.respond_to?(:to_a) ? v.to_a : v }
221
307
  tdm = self.class.matrix_class.alloc(*tda_arrays).trans
222
- ntdm = build_reduced_matrix(tdm, cutoff)
308
+ ntdm, u_mat = build_reduced_matrix_with_u(tdm, cutoff)
223
309
  assign_native_ext_lsi_vectors(ntdm, doc_list)
224
310
  else
225
311
  tdm = Matrix.rows(tda).trans
226
- ntdm = build_reduced_matrix(tdm, cutoff)
312
+ ntdm, u_mat = build_reduced_matrix_with_u(tdm, cutoff)
227
313
  assign_ruby_lsi_vectors(ntdm, doc_list)
228
314
  end
229
315
 
316
+ # Store U matrix for incremental mode
317
+ if @incremental_mode
318
+ @u_matrix = u_mat
319
+ @initial_vocab_size = @word_list.size
320
+ end
321
+
230
322
  @built_at_version = @version
231
323
  end
232
324
  end
@@ -532,6 +624,100 @@ module Classifier
532
624
  from_json(File.read(path))
533
625
  end
534
626
 
627
+ # Loads an LSI index from a checkpoint.
628
+ #
629
+ # @rbs (storage: Storage::Base, checkpoint_id: String) -> LSI
630
def self.load_checkpoint(storage:, checkpoint_id:)
  unless storage.is_a?(Storage::File)
    raise ArgumentError, 'Storage must be File storage for checkpoints'
  end

  # The checkpoint file sits next to the main file and is named
  # <base>_checkpoint_<id><ext>.
  source_path = storage.path
  file_name = "#{File.basename(source_path, '.*')}_checkpoint_#{checkpoint_id}#{File.extname(source_path)}"
  checkpoint_path = File.join(File.dirname(source_path), file_name)

  load(storage: Storage::File.new(path: checkpoint_path)).tap do |instance|
    # Re-point the loaded instance at the caller's storage.
    instance.storage = storage
  end
end
643
+
644
+ # Trains the LSI index from an IO stream.
645
+ # Each line in the stream is treated as a separate document.
646
+ # Documents are added without rebuilding, then the index is rebuilt at the end.
647
+ #
648
+ # @example Train from a file
649
+ # lsi.train_from_stream(:category, File.open('corpus.txt'))
650
+ #
651
+ # @example With progress tracking
652
+ # lsi.train_from_stream(:category, io, batch_size: 500) do |progress|
653
+ # puts "#{progress.completed} documents processed"
654
+ # end
655
+ #
656
+ # @rbs (String | Symbol, IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
657
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
  # Suspend auto-rebuild while lines are added; it is restored in `ensure`
  # and the index is rebuilt once afterwards only if it was originally on.
  original_auto_rebuild = @auto_rebuild
  @auto_rebuild = false

  begin
    reader = Streaming::LineReader.new(io, batch_size: batch_size)
    progress = Streaming::Progress.new(total: reader.estimate_line_count)

    reader.each_batch do |batch|
      # Normalise the category to a String, as #add and #add_batch do, so
      # :dog and "dog" land in the same category whichever API was used.
      batch.each { |text| add_item(text, category.to_s) }
      progress.completed += batch.size
      progress.current_batch += 1
      # The progress block is optional.
      yield progress if block_given?
    end
  ensure
    @auto_rebuild = original_auto_rebuild
    build_index if original_auto_rebuild
  end
end
677
+
678
+ # Adds items to the index in batches from an array.
679
+ # Documents are added without rebuilding, then the index is rebuilt at the end.
680
+ #
681
+ # @example Batch add with progress
682
+ # lsi.add_batch(Dog: documents, batch_size: 100) do |progress|
683
+ # puts "#{progress.percent}% complete"
684
+ # end
685
+ #
686
+ # @rbs (?batch_size: Integer, **Array[String]) { (Streaming::Progress) -> void } -> void
687
def add_batch(batch_size: Streaming::DEFAULT_BATCH_SIZE, **items)
  # Remember the caller's setting and suspend auto-rebuild while adding.
  rebuild_after = @auto_rebuild
  @auto_rebuild = false

  begin
    document_count = items.each_value.sum { |docs| Array(docs).size }
    tracker = Streaming::Progress.new(total: document_count)

    items.each_pair do |category, documents|
      Array(documents).each_slice(batch_size) do |chunk|
        chunk.each { |doc| add_item(doc, category.to_s) }
        tracker.completed += chunk.size
        tracker.current_batch += 1
        yield tracker if block_given?
      end
    end
  ensure
    # Restore the setting and rebuild once if auto-rebuild was on.
    @auto_rebuild = rebuild_after
    build_index if rebuild_after
  end
end
708
+
709
+ # Alias train_batch to add_batch for API consistency with other classifiers.
710
+ # Note: LSI uses categories differently (items have categories, not the training call).
711
+ #
712
+ # @rbs (?(String | Symbol)?, ?Array[String]?, ?batch_size: Integer, **Array[String]) { (Streaming::Progress) -> void } -> void
713
def train_batch(category = nil, documents = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &block)
  # Accepts the positional form (category, documents), the keyword form
  # (Dog: [...], Cat: [...]), or both; everything is forwarded to add_batch.
  if category && documents
    positional = { category.to_sym => documents }
    # Keep keyword categories too (they used to be dropped silently). If the
    # same category appears in both forms, its document lists are joined.
    categories = positional.merge(categories) { |_key, first, second| Array(first) + Array(second) }
  end

  add_batch(batch_size: batch_size, **categories, &block)
end
720
+
535
721
  private
536
722
 
537
723
  # Restores LSI state from a JSON string (used by reload)
@@ -602,6 +788,7 @@ module Classifier
602
788
  # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
603
789
  def proximity_array_for_content_unlocked(doc, &)
604
790
  return [] if needs_rebuild_unlocked?
791
+ return @items.keys.map { |item| [item, 1.0] } if @items.size == 1
605
792
 
606
793
  content_node = node_for_content_unlocked(doc, &)
607
794
  result =
@@ -651,7 +838,7 @@ module Classifier
651
838
  votes
652
839
  end
653
840
 
654
- # Unlocked version of node_for_content for internal use
841
+ # Unlocked version of node_for_content for internal use.
655
842
  # @rbs (String) ?{ (String) -> String } -> ContentNode
656
843
  def node_for_content_unlocked(item, &block)
657
844
  return @items[item] if @items[item]
@@ -659,31 +846,68 @@ module Classifier
659
846
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
660
847
  cn = ContentNode.new(clean_word_hash, &block)
661
848
  cn.raw_vector_with(@word_list) unless needs_rebuild_unlocked?
849
+ assign_lsi_vector_incremental(cn) if incremental_enabled?
662
850
  cn
663
851
  end
664
852
 
665
853
  # @rbs (untyped, ?Float) -> untyped
666
854
  def build_reduced_matrix(matrix, cutoff = 0.75)
667
- # TODO: Check that M>=N on these dimensions! Transpose helps assure this
668
- u, v, s = matrix.SV_decomp
855
+ result, _u = build_reduced_matrix_with_u(matrix, cutoff)
856
+ result
857
+ end
669
858
 
670
- @singular_values = s.sort.reverse
859
+ # Builds reduced matrix and returns both the result and the U matrix.
860
+ # U matrix is needed for incremental SVD updates.
861
+ # @rbs (untyped, ?Float) -> [untyped, Matrix]
862
+ def build_reduced_matrix_with_u(matrix, cutoff = 0.75)
863
+ u, v, s = matrix.SV_decomp
671
864
 
865
+ all_singular_values = s.sort.reverse
672
866
  s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
673
- s_cutoff = @singular_values[s_cutoff_index]
867
+ s_cutoff = all_singular_values[s_cutoff_index]
868
+
869
+ kept_indices = []
870
+ kept_singular_values = []
674
871
  s.size.times do |ord|
675
- s[ord] = 0.0 if s[ord] < s_cutoff
872
+ if s[ord] >= s_cutoff
873
+ kept_indices << ord
874
+ kept_singular_values << s[ord]
875
+ else
876
+ s[ord] = 0.0
877
+ end
676
878
  end
677
- # Reconstruct the term document matrix, only with reduced rank
678
- result = u * self.class.matrix_class.diag(s) * v.trans
679
879
 
680
- # SVD may return transposed dimensions when row_size < column_size
681
- # Ensure result matches input dimensions
880
+ @singular_values = kept_singular_values.sort.reverse
881
+ result = u * self.class.matrix_class.diag(s) * v.trans
682
882
  result = result.trans if result.row_size != matrix.row_size
883
+ u_reduced = extract_reduced_u(u, kept_indices, s)
683
884
 
684
- result
885
+ [result, u_reduced]
685
886
  end
686
887
 
888
+ # Extracts columns from U corresponding to kept singular values.
889
+ # Columns are sorted by descending singular value to match @singular_values order.
890
+ # rubocop:disable Naming/MethodParameterName
891
+ # @rbs (untyped, Array[Integer], Array[Float]) -> Matrix
892
def extract_reduced_u(u, kept_indices, singular_values)
  return Matrix.empty(u.row_size, 0) if kept_indices.empty?

  # Largest singular value first.
  ordered = kept_indices.sort_by { |index| -singular_values[index] }

  unless u.respond_to?(:to_ruby_matrix) || u.is_a?(::Matrix)
    # Neither convertible nor a stdlib Matrix: read entries through u[i, j].
    picked_rows = Array.new(u.row_size) do |row|
      ordered.map { |col| u[row, col] }
    end
    return Matrix.rows(picked_rows)
  end

  source = u.respond_to?(:to_ruby_matrix) ? u.to_ruby_matrix : u
  Matrix.columns(ordered.map { |col| source.column(col).to_a })
end
909
+ # rubocop:enable Naming/MethodParameterName
910
+
687
911
  # @rbs () -> void
688
912
  def make_word_list
689
913
  @word_list = WordList.new
@@ -691,5 +915,129 @@ module Classifier
691
915
  node.word_hash.each_key { |key| @word_list.add_word key }
692
916
  end
693
917
  end
918
+
919
+ # Performs incremental SVD update for a new document.
920
+ # @rbs (ContentNode, Hash[Symbol, Integer]) -> void
921
def perform_incremental_update(node, word_hash)
  fall_back_to_rebuild = false

  synchronize do
    if vocabulary_growth_exceeds_threshold?(word_hash)
      # Too many unseen words: leave incremental mode and rebuild below,
      # after the lock has been released.
      disable_incremental_mode!
      fall_back_to_rebuild = true
    else
      rank_before = @u_matrix.column_size
      extend_vocabulary_for_incremental(word_hash)
      raw_vector = Vector[*node.raw_vector_with(@word_list).to_a]

      @u_matrix, @singular_values = IncrementalSVD.update(
        @u_matrix, @singular_values, raw_vector, max_rank: @max_rank
      )

      if @u_matrix.column_size > rank_before
        # The rank grew, so every stored vector is re-projected.
        reproject_all_documents
      else
        assign_lsi_vector_incremental(node)
      end

      @built_at_version = @version
    end
  end

  build_index if fall_back_to_rebuild
end
953
+
954
+ # Checks if vocabulary growth would exceed threshold (20%)
955
+ # @rbs (Hash[Symbol, Integer]) -> bool
956
def vocabulary_growth_exceeds_threshold?(word_hash, threshold: 0.2)
  # Returns true when the share of words in word_hash that the current word
  # list does not know, relative to the vocabulary size recorded at the last
  # full build, is strictly greater than `threshold` (default 0.2 = 20%).
  #
  # With no positive baseline recorded there is nothing to compare against.
  return false unless @initial_vocab_size&.positive?

  # Count the words for which the word list lookup returns nil.
  new_words = word_hash.each_key.count { |word| @word_list[word].nil? }
  # NOTE(review): only this document's unseen words are measured against the
  # baseline, not the cumulative growth since the last build — confirm intent.
  new_words.to_f / @initial_vocab_size > threshold
end
963
+
964
+ # Extends vocabulary and U matrix for new words.
965
+ # @rbs (Hash[Symbol, Integer]) -> void
966
def extend_vocabulary_for_incremental(word_hash)
  # Collect the unseen words first, then register them.
  unseen = word_hash.each_key.select { |word| @word_list[word].nil? }
  return if unseen.empty?

  unseen.each { |word| @word_list.add_word(word) }
  # One extra U row per word that was just added.
  extend_u_matrix(unseen.size)
end
973
+
974
+ # Extends U matrix with zero rows for new vocabulary terms.
975
+ # @rbs (Integer) -> void
976
def extend_u_matrix(num_new_rows)
  return if num_new_rows.zero? || @u_matrix.nil?

  width = @u_matrix.column_size
  owner = self.class

  # Stack zero rows under U, using the matrix type U currently has.
  @u_matrix =
    if owner.native_available? && @u_matrix.is_a?(owner.matrix_class)
      native = owner.matrix_class
      native.vstack(@u_matrix, native.zeros(num_new_rows, width))
    else
      Matrix.vstack(@u_matrix, Matrix.zero(num_new_rows, width))
    end
end
987
+
988
+ # Re-projects all documents onto the current U matrix
989
+ # Called when rank grows to ensure consistent LSI vector sizes
990
+ # Uses native batch_project for performance when available
991
+ # @rbs () -> void
992
def reproject_all_documents
  return unless @u_matrix

  # Use the batch path only when the native extension is available and the
  # stored U matrix offers batch_project.
  if self.class.native_available? && @u_matrix.respond_to?(:batch_project)
    reproject_all_documents_native
  else
    reproject_all_documents_ruby
  end
end
998
+
999
+ # Native batch re-projection using C extension.
1000
+ # @rbs () -> void
1001
def reproject_all_documents_native
  nodes = @items.values

  # Hand batch_project instances of the configured vector class, converting
  # any raw vector that is not one already.
  raw_vectors = nodes.map do |node|
    raw = node.raw_vector_with(@word_list)
    next raw if raw.is_a?(self.class.vector_class)

    self.class.vector_class.alloc(raw.to_a)
  end

  projected = @u_matrix.batch_project(raw_vectors)

  nodes.each.with_index do |node, index|
    row_vector = projected[index].row
    node.lsi_vector = row_vector
    node.lsi_norm = row_vector.normalize
  end
end
1016
+
1017
+ # Pure Ruby re-projection (fallback)
1018
+ # @rbs () -> void
1019
def reproject_all_documents_ruby
  # Re-project each stored node individually.
  @items.each_value { |node| assign_lsi_vector_incremental(node) }
end
1024
+
1025
+ # Assigns LSI vector to a node using projection: lsi_vec = U^T * raw_vec.
1026
+ # @rbs (ContentNode) -> void
1027
def assign_lsi_vector_incremental(node)
  return unless @u_matrix

  # Project the raw term vector: lsi = U^T * raw.
  term_vector = Vector[*node.raw_vector_with(@word_list).to_a]
  coordinates = (@u_matrix.transpose * term_vector).to_a

  # Wrap the result in the native row vector or a stdlib Vector.
  projected =
    if self.class.native_available?
      self.class.vector_class.alloc(coordinates).row
    else
      Vector[*coordinates]
    end

  node.lsi_vector = projected
  node.lsi_norm = projected.normalize
end
694
1042
  end
695
1043
  end
@@ -0,0 +1,99 @@
1
+ # rbs_inline: enabled
2
+
3
module Classifier
  module Streaming
    # Memory-efficient line reader for large files and IO streams.
    # Reads lines one at a time and can yield them in configurable batches.
    #
    # @example Reading line by line
    #   reader = LineReader.new(File.open('large_corpus.txt'))
    #   reader.each { |line| process(line) }
    #
    # @example Reading in batches
    #   reader = LineReader.new(io, batch_size: 100)
    #   reader.each_batch { |batch| process_batch(batch) }
    class LineReader
      include Enumerable #[String]

      # @rbs @io: IO
      # @rbs @batch_size: Integer

      attr_reader :batch_size

      # Creates a new LineReader.
      #
      # Raises ArgumentError when batch_size is not a positive Integer; such
      # a value would otherwise silently degrade batching to one line each.
      #
      # @rbs (IO, ?batch_size: Integer) -> void
      def initialize(io, batch_size: 100)
        unless batch_size.is_a?(Integer) && batch_size.positive?
          raise ArgumentError, "batch_size must be a positive Integer, got #{batch_size.inspect}"
        end

        @io = io
        @batch_size = batch_size
      end

      # Iterates over each line in the IO stream, starting at its current
      # position. Lines are chomped (trailing newlines removed).
      #
      # @rbs () { (String) -> void } -> void
      # @rbs () -> Enumerator[String, void]
      def each
        return enum_for(:each) unless block_given?

        @io.each_line do |line|
          yield line.chomp
        end
      end

      # Iterates over batches of lines.
      # Each batch is an array of at most batch_size chomped lines; the last
      # batch may be shorter and empty batches are never yielded.
      #
      # @rbs () { (Array[String]) -> void } -> void
      # @rbs () -> Enumerator[Array[String], void]
      def each_batch
        return enum_for(:each_batch) unless block_given?

        each_slice(@batch_size) { |batch| yield batch }
      end

      # Estimates the total number of lines in the IO stream.
      # Samples up to sample_size lines from the start of the stream and
      # divides the stream size by their average byte length, so the result
      # is a rough estimate. The read position is restored afterwards, even
      # when sampling fails part-way.
      # Returns nil when the stream does not respond to size and rewind, is
      # empty, or raises IOError / Errno::ESPIPE while being sampled.
      #
      # @rbs (?sample_size: Integer) -> Integer?
      def estimate_line_count(sample_size: 100)
        return nil unless @io.respond_to?(:size) && @io.respond_to?(:rewind)

        original_pos = @io.pos
        sample_bytes = 0
        sample_lines = 0

        begin
          @io.rewind
          sample_size.times do
            line = @io.gets
            break unless line

            sample_bytes += line.bytesize
            sample_lines += 1
          end
        ensure
          # Always return to where the caller left the stream, otherwise a
          # later #each would re-read the sampled lines.
          @io.seek(original_pos)
        end

        return nil if sample_lines.zero?

        avg_line_size = sample_bytes.to_f / sample_lines
        io_size = @io.__send__(:size) #: Integer
        (io_size / avg_line_size).round
      rescue IOError, Errno::ESPIPE
        nil
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # rbs_inline: enabled
2
+
3
module Classifier
  module Streaming
    # Progress tracking object yielded to blocks during batch/stream operations.
    # Provides information about training progress including completion percentage,
    # elapsed time, processing rate, and estimated time remaining.
    #
    # @example Basic usage with train_batch
    #   classifier.train_batch(:spam, documents, batch_size: 100) do |progress|
    #     puts "#{progress.completed}/#{progress.total} (#{progress.percent}%)"
    #     puts "Rate: #{progress.rate.round(1)} docs/sec"
    #     puts "ETA: #{progress.eta&.round}s" if progress.eta
    #   end
    class Progress
      # @rbs @completed: Integer
      # @rbs @total: Integer?
      # @rbs @start_time: Time
      # @rbs @current_batch: Integer

      attr_reader :start_time, :total
      attr_accessor :completed, :current_batch

      # @rbs (?total: Integer?, ?completed: Integer) -> void
      def initialize(total: nil, completed: 0)
        @completed = completed
        @total = total
        @start_time = Time.now
        @current_batch = 0
      end

      # Returns the completion percentage (0-100), rounded to two decimals.
      # The value is capped at 100.0 because the total may be an estimate
      # that the completed count overshoots.
      # Returns nil if total is unknown or not positive.
      #
      # @rbs () -> Float?
      def percent
        return nil unless @total&.positive?

        [(@completed.to_f / @total * 100).round(2), 100.0].min
      end

      # Returns the elapsed time in seconds since the operation started.
      # Never negative: a wall clock that moved backwards reads as 0.0.
      #
      # @rbs () -> Float
      def elapsed
        [Time.now - @start_time, 0.0].max
      end

      # Returns the processing rate in items per second.
      # Returns 0.0 if no time has elapsed.
      #
      # @rbs () -> Float
      def rate
        seconds = elapsed
        return 0.0 unless seconds.positive?

        @completed / seconds
      end

      # Returns the estimated time remaining in seconds.
      # Returns nil if total is unknown, 0.0 once the operation is complete,
      # and nil while the rate is still zero.
      #
      # @rbs () -> Float?
      def eta
        return nil unless @total
        # Checked before the rate so a finished run reports 0.0, not nil.
        return 0.0 if @completed >= @total

        # Read the rate once so the guard and the division see the same value.
        current_rate = rate
        return nil unless current_rate.positive?

        (@total - @completed) / current_rate
      end

      # Returns true if the operation is complete.
      # Always false when total is unknown.
      #
      # @rbs () -> bool
      def complete?
        return false unless @total

        @completed >= @total
      end

      # Returns a hash representation of the progress state.
      #
      # @rbs () -> Hash[Symbol, untyped]
      def to_h
        {
          completed: @completed,
          total: @total,
          percent: percent,
          elapsed: elapsed.round(2),
          rate: rate.round(2),
          eta: eta&.round(2)
        }
      end
    end
  end
end