classifier 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ # rbs_inline: enabled
2
+
3
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
4
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
5
+ # License:: LGPL
6
+
7
module Classifier
  module Storage
    # Abstract protocol that every storage backend follows.
    # Subclass it and override all four methods to persist classifier data in a
    # custom store (Redis, PostgreSQL, etc.). Each method defined here only
    # raises NotImplementedError naming the concrete class and the missing method.
    #
    # Example:
    #   class RedisStorage < Classifier::Storage::Base
    #     def initialize(redis:, key:)
    #       @redis, @key = redis, key
    #     end
    #
    #     def write(data) = @redis.set(@key, data)
    #     def read = @redis.get(@key)
    #     def delete = @redis.del(@key)
    #     def exists? = @redis.exists?(@key)
    #   end
    #
    class Base
      # Persist the serialized classifier data. Must be overridden.
      # @rbs (String) -> void
      def write(data)
        message = "#{self.class}#write must be implemented"
        raise NotImplementedError, message
      end

      # Fetch previously stored classifier data. Must be overridden.
      # @rbs () -> String?
      def read
        message = "#{self.class}#read must be implemented"
        raise NotImplementedError, message
      end

      # Remove the stored classifier data. Must be overridden.
      # @rbs () -> void
      def delete
        message = "#{self.class}#delete must be implemented"
        raise NotImplementedError, message
      end

      # Report whether any data is currently stored. Must be overridden.
      # @rbs () -> bool
      def exists?
        message = "#{self.class}#exists? must be implemented"
        raise NotImplementedError, message
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # rbs_inline: enabled
2
+
3
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
4
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
5
+ # License:: LGPL
6
+
7
+ require_relative 'base'
8
+
9
module Classifier
  module Storage
    # File-based storage backend: the serialized classifier lives in one file
    # on the local filesystem, identified by +path+.
    #
    # Example:
    #   bayes = Classifier::Bayes.new('Spam', 'Ham')
    #   bayes.storage = Classifier::Storage::File.new(path: "/var/models/spam.json")
    #   bayes.train_spam("Buy now!")
    #   bayes.save
    #
    class File < Base
      # @rbs @path: String

      attr_reader :path

      # Remembers the target path; nothing is read or written until requested.
      #
      # @rbs (path: String) -> void
      def initialize(path:)
        super()
        @path = path
      end

      # Writes +data+ to the file at +path+ and returns what ::File.write returns
      # (the number of bytes written).
      #
      # @rbs (String) -> Integer
      def write(data)
        ::File.write(@path, data)
      end

      # Returns the file contents, or nil when the file does not exist.
      #
      # @rbs () -> String?
      def read
        return nil unless exists?

        ::File.read(@path)
      rescue Errno::ENOENT
        # The file disappeared between the existence check and the read;
        # report "no data" exactly as if it had never been there.
        nil
      end

      # Removes the file if it is present; a missing file is not an error.
      #
      # @rbs () -> void
      def delete
        return unless exists?

        ::File.delete(@path)
      rescue Errno::ENOENT
        # Someone else removed the file after the existence check; the goal
        # (no file) is already met.
        nil
      end

      # True when something exists at +path+.
      #
      # @rbs () -> bool
      def exists?
        ::File.exist?(@path)
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # rbs_inline: enabled
2
+
3
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
4
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
5
+ # License:: LGPL
6
+
7
+ require_relative 'base'
8
+
9
module Classifier
  module Storage
    # Storage backend that holds the serialized classifier in an instance
    # variable. Intended for tests and short-lived use: nothing is persisted
    # outside the object.
    #
    # Example:
    #   bayes = Classifier::Bayes.new('Spam', 'Ham')
    #   bayes.storage = Classifier::Storage::Memory.new
    #   bayes.train_spam("Buy now!")
    #   bayes.save
    #
    class Memory < Base
      # @rbs @data: String?

      # Starts out empty (no data stored).
      #
      # @rbs () -> void
      def initialize
        super()
        @data = nil
      end

      # Keeps a reference to +data+ and returns it.
      #
      # @rbs (String) -> String
      def write(data)
        @data = data
      end

      # Returns whatever was last written, or nil when empty.
      #
      # @rbs () -> String?
      def read = @data

      # Forgets the stored data.
      #
      # @rbs () -> void
      def delete
        @data = nil
      end

      # True once something has been written and not yet deleted.
      #
      # @rbs () -> bool
      def exists? = !@data.nil?
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # rbs_inline: enabled
2
+
3
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
4
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
5
+ # License:: LGPL
6
+
7
+ require_relative 'storage/base'
8
+ require_relative 'storage/memory'
9
+ require_relative 'storage/file'
@@ -0,0 +1,99 @@
1
+ # rbs_inline: enabled
2
+
3
module Classifier
  module Streaming
    # Memory-efficient line reader for large files and IO streams.
    # Lines are pulled from the underlying IO one at a time and can be handed
    # out individually or grouped into batches of +batch_size+.
    #
    # @example Reading line by line
    #   reader = LineReader.new(File.open('large_corpus.txt'))
    #   reader.each { |line| process(line) }
    #
    # @example Reading in batches
    #   reader = LineReader.new(io, batch_size: 100)
    #   reader.each_batch { |batch| process_batch(batch) }
    class LineReader
      include Enumerable #[String]

      # @rbs @io: IO
      # @rbs @batch_size: Integer

      attr_reader :batch_size

      # Creates a new LineReader over +io+ (anything responding to +each_line+).
      #
      # @rbs (IO, ?batch_size: Integer) -> void
      def initialize(io, batch_size: 100)
        @io = io
        @batch_size = batch_size
      end

      # Iterates over each line in the IO stream.
      # Lines are chomped (trailing newlines removed).
      # Without a block, returns an Enumerator over the same lines.
      #
      # @rbs () { (String) -> void } -> void
      # @rbs () -> Enumerator[String, void]
      def each
        return enum_for(:each) unless block_given?

        @io.each_line do |line|
          yield line.chomp
        end
      end

      # Iterates over batches of lines.
      # Each batch is an array of chomped lines holding up to +batch_size+
      # entries; the final batch may be shorter. Without a block, returns an
      # Enumerator over the batches.
      #
      # @rbs () { (Array[String]) -> void } -> void
      # @rbs () -> Enumerator[Array[String], void]
      def each_batch
        return enum_for(:each_batch) unless block_given?

        batch = [] #: Array[String]
        each do |line|
          batch << line
          if batch.size >= @batch_size
            yield batch
            # Start a fresh array so the batch already handed out is never mutated.
            batch = []
          end
        end
        yield batch unless batch.empty?
      end

      # Estimates the total number of lines in the IO stream.
      # This is a rough estimate: the first +sample_size+ lines are read to get
      # an average line length in bytes, and the stream size is divided by it.
      #
      # The stream is rewound for sampling and is always moved back to the
      # position it had on entry, including when sampling raises.
      #
      # Returns nil when the stream lacks +size+/+rewind+, when it is empty, or
      # when an IOError / Errno::ESPIPE occurs (e.g. non-seekable streams).
      #
      # @rbs (?sample_size: Integer) -> Integer?
      def estimate_line_count(sample_size: 100)
        return nil unless @io.respond_to?(:size) && @io.respond_to?(:rewind)

        original_pos = @io.pos
        sample_bytes = 0
        sample_lines = 0

        begin
          @io.rewind

          sample_size.times do
            line = @io.gets
            break unless line

            sample_bytes += line.bytesize
            sample_lines += 1
          end
        ensure
          # Restore the caller's read position even if sampling failed part-way.
          @io.seek(original_pos)
        end

        return nil if sample_lines.zero?

        avg_line_size = sample_bytes.to_f / sample_lines
        io_size = @io.__send__(:size) #: Integer
        (io_size / avg_line_size).round
      rescue IOError, Errno::ESPIPE
        nil
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # rbs_inline: enabled
2
+
3
module Classifier
  module Streaming
    # Progress tracking object yielded to blocks during batch/stream operations.
    # It records how many items are done, the optional total, and the wall-clock
    # start time, and derives percentage, elapsed time, rate and ETA from them.
    #
    # @example Basic usage with train_batch
    #   classifier.train_batch(:spam, documents, batch_size: 100) do |progress|
    #     puts "#{progress.completed}/#{progress.total} (#{progress.percent}%)"
    #     puts "Rate: #{progress.rate.round(1)} docs/sec"
    #     puts "ETA: #{progress.eta&.round}s" if progress.eta
    #   end
    class Progress
      # @rbs @completed: Integer
      # @rbs @total: Integer?
      # @rbs @start_time: Time
      # @rbs @current_batch: Integer

      attr_reader :start_time, :total
      attr_accessor :completed, :current_batch

      # Starts the clock at construction time; +current_batch+ begins at 0.
      #
      # @rbs (?total: Integer?, ?completed: Integer) -> void
      def initialize(total: nil, completed: 0)
        @start_time = Time.now
        @total = total
        @completed = completed
        @current_batch = 0
      end

      # Completion percentage rounded to two decimals.
      # nil when the total is unknown or not positive.
      #
      # @rbs () -> Float?
      def percent
        if @total&.positive?
          fraction = @completed.to_f / @total
          (fraction * 100).round(2)
        end
      end

      # Seconds elapsed since this object was created.
      #
      # @rbs () -> Float
      def elapsed
        Time.now - @start_time
      end

      # Items processed per second; 0.0 when no time has elapsed yet.
      #
      # @rbs () -> Float
      def rate
        seconds = elapsed
        seconds.zero? ? 0.0 : @completed / seconds
      end

      # Estimated seconds remaining.
      # nil when the total is unknown or the rate is zero; 0.0 once the
      # completed count has reached the total.
      #
      # @rbs () -> Float?
      def eta
        return nil unless @total && !rate.zero?

        remaining = @total - @completed
        return 0.0 unless remaining.positive?

        remaining / rate
      end

      # True when a total is known and the completed count has reached it.
      #
      # @rbs () -> bool
      def complete?
        @total ? @completed >= @total : false
      end

      # Hash snapshot of the progress state; time-derived values are rounded
      # to two decimals.
      #
      # @rbs () -> Hash[Symbol, untyped]
      def to_h
        snapshot = { completed: @completed, total: @total }
        snapshot[:percent] = percent
        snapshot[:elapsed] = elapsed.round(2)
        snapshot[:rate] = rate.round(2)
        snapshot[:eta] = eta&.round(2)
        snapshot
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ # rbs_inline: enabled
2
+
3
+ require_relative 'streaming/progress'
4
+ require_relative 'streaming/line_reader'
5
+
6
module Classifier
  # Streaming module provides memory-efficient training capabilities for classifiers.
  # Include this module in a classifier to add streaming and batch training methods.
  #
  # The training entry points here are placeholders that raise
  # NotImplementedError; the including class supplies the real implementations.
  # The checkpoint helpers call +storage+, +storage=+ and +save+ on the includer.
  #
  # @example Including in a classifier
  #   class MyClassifier
  #     include Classifier::Streaming
  #   end
  #
  # @example Streaming training
  #   classifier.train_from_stream(:category, File.open('corpus.txt'))
  #
  # @example Batch training with progress
  #   classifier.train_batch(:category, documents, batch_size: 100) do |progress|
  #     puts "#{progress.percent}% complete"
  #   end
  module Streaming
    # Default batch size for streaming operations
    DEFAULT_BATCH_SIZE = 100

    # Trains the classifier from an IO stream.
    # Each line in the stream is treated as a separate document.
    # This default only raises; including classes must override it.
    #
    # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
    def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
      raise NotImplementedError, "#{self.class} must implement train_from_stream"
    end

    # Trains the classifier with an array of documents in batches.
    # Supports both positional and keyword argument styles.
    # This default only raises; including classes must override it.
    #
    # @example Positional style
    #   classifier.train_batch(:spam, documents, batch_size: 100)
    #
    # @example Keyword style
    #   classifier.train_batch(spam: documents, ham: other_docs, batch_size: 100)
    #
    # @rbs (?(Symbol | String)?, ?Array[String]?, ?batch_size: Integer, **Array[String]) { (Progress) -> void } -> void
    def train_batch(category = nil, documents = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
      raise NotImplementedError, "#{self.class} must implement train_batch"
    end

    # Saves a checkpoint of the current training state.
    # Temporarily swaps the configured storage for a checkpoint-specific one,
    # calls +save+, and always restores the original storage afterwards.
    #
    # Raises ArgumentError when no storage is configured or when the storage
    # type does not support checkpoints.
    #
    # @rbs (String) -> void
    def save_checkpoint(checkpoint_id)
      raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage

      original_storage = storage

      begin
        self.storage = checkpoint_storage_for(checkpoint_id)
        save
      ensure
        # Put the real storage back even if building the checkpoint storage or saving failed.
        self.storage = original_storage
      end
    end

    # Lists available checkpoints as a sorted array of checkpoint ids.
    # Only file storage keeps checkpoints; any other storage yields [].
    #
    # Raises ArgumentError when no storage is configured.
    #
    # @rbs () -> Array[String]
    def list_checkpoints
      raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
      return [] unless storage.is_a?(Storage::File)

      file_storage = storage #: Storage::File
      dir = File.dirname(file_storage.path)
      ext = File.extname(file_storage.path)
      prefix = "#{File.basename(file_storage.path, '.*')}_checkpoint_"

      # Match names literally instead of globbing: a glob pattern built from the
      # path would misread characters such as "[", "{", "*" or "?" in it.
      names = begin
        Dir.children(dir)
      rescue SystemCallError
        # A missing or unreadable directory simply has no checkpoints.
        [] #: Array[String]
      end

      names.filter_map do |name|
        next unless name.start_with?(prefix) && name.end_with?(ext)
        # Prefix and extension must not overlap inside the file name.
        next if name.length < prefix.length + ext.length

        name[prefix.length...(name.length - ext.length)]
      end.sort
    end

    # Deletes a checkpoint if it exists; a missing checkpoint is ignored.
    #
    # Raises ArgumentError when no storage is configured or when the storage
    # type does not support checkpoints.
    #
    # @rbs (String) -> void
    def delete_checkpoint(checkpoint_id)
      raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage

      checkpoint_storage = checkpoint_storage_for(checkpoint_id)
      checkpoint_storage.delete if checkpoint_storage.exists?
    end

    private

    # Builds the checkpoint file path next to the main storage file:
    # "<dir>/<base>_checkpoint_<checkpoint_id><ext>".
    # +checkpoint_id+ is interpolated verbatim, so it should be a plain
    # identifier without path separators.
    #
    # @rbs (String) -> String
    def checkpoint_path_for(checkpoint_id)
      raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)

      file_storage = storage #: Storage::File
      dir = File.dirname(file_storage.path)
      base = File.basename(file_storage.path, '.*')
      ext = File.extname(file_storage.path)

      File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
    end

    # Returns a storage object pointing at the given checkpoint.
    # Only file storage is supported; anything else raises ArgumentError.
    #
    # @rbs (String) -> Storage::Base
    def checkpoint_storage_for(checkpoint_id)
      case storage
      when Storage::File
        Storage::File.new(path: checkpoint_path_for(checkpoint_id))
      else
        raise ArgumentError, "Checkpoints not supported for #{storage.class}"
      end
    end
  end
end