classifier 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +23 -13
- data/README.md +72 -190
- data/ext/classifier/classifier_ext.c +26 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/incremental_svd.c +393 -0
- data/ext/classifier/linalg.h +72 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +398 -54
- data/lib/classifier/errors.rb +19 -0
- data/lib/classifier/extensions/vector.rb +12 -4
- data/lib/classifier/knn.rb +351 -0
- data/lib/classifier/logistic_regression.rb +571 -0
- data/lib/classifier/lsi/content_node.rb +5 -5
- data/lib/classifier/lsi/incremental_svd.rb +166 -0
- data/lib/classifier/lsi/summary.rb +25 -5
- data/lib/classifier/lsi.rb +784 -138
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier/streaming/line_reader.rb +99 -0
- data/lib/classifier/streaming/progress.rb +96 -0
- data/lib/classifier/streaming.rb +122 -0
- data/lib/classifier/tfidf.rb +408 -0
- data/lib/classifier.rb +6 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/matrix.rbs +25 -14
- data/sig/vendor/mutex_m.rbs +16 -0
- data/sig/vendor/streaming.rbs +14 -0
- data/test/test_helper.rb +2 -0
- metadata +52 -8
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
|
|
7
|
+
module Classifier
|
|
8
|
+
module Storage
|
|
9
|
+
# Abstract base class for storage backends.
|
|
10
|
+
# Implement this protocol to create custom storage (Redis, PostgreSQL, etc.)
|
|
11
|
+
#
|
|
12
|
+
# Example:
|
|
13
|
+
# class RedisStorage < Classifier::Storage::Base
|
|
14
|
+
# def initialize(redis:, key:)
|
|
15
|
+
# @redis, @key = redis, key
|
|
16
|
+
# end
|
|
17
|
+
#
|
|
18
|
+
# def write(data) = @redis.set(@key, data)
|
|
19
|
+
# def read = @redis.get(@key)
|
|
20
|
+
# def delete = @redis.del(@key)
|
|
21
|
+
# def exists? = @redis.exists?(@key)
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
24
|
+
class Base
  # Persist the serialized classifier payload.
  # Subclasses must override; the base implementation always raises.
  # @rbs (String) -> void
  def write(data) = raise(NotImplementedError, "#{self.class}#write must be implemented")

  # Fetch the previously persisted payload.
  # Subclasses must override; the base implementation always raises.
  # @rbs () -> String?
  def read = raise(NotImplementedError, "#{self.class}#read must be implemented")

  # Remove the persisted payload.
  # Subclasses must override; the base implementation always raises.
  # @rbs () -> void
  def delete = raise(NotImplementedError, "#{self.class}#delete must be implemented")

  # Report whether a payload is currently persisted.
  # Subclasses must override; the base implementation always raises.
  # @rbs () -> bool
  def exists? = raise(NotImplementedError, "#{self.class}#exists? must be implemented")
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
|
|
7
|
+
require_relative 'base'
|
|
8
|
+
|
|
9
|
+
module Classifier
|
|
10
|
+
module Storage
|
|
11
|
+
# File-based storage backend.
|
|
12
|
+
#
|
|
13
|
+
# Example:
|
|
14
|
+
# bayes = Classifier::Bayes.new('Spam', 'Ham')
|
|
15
|
+
# bayes.storage = Classifier::Storage::File.new(path: "/var/models/spam.json")
|
|
16
|
+
# bayes.train_spam("Buy now!")
|
|
17
|
+
# bayes.save
|
|
18
|
+
#
|
|
19
|
+
class File < Base
  # @rbs @path: String

  # Filesystem location the classifier data is written to and read from.
  attr_reader :path

  # @rbs (path: String) -> void
  def initialize(path:)
    super()
    @path = path
  end

  # Writes +data+ to the configured path, replacing any existing content.
  # Returns the number of bytes written (the value of ::File.write).
  #
  # @rbs (String) -> Integer
  def write(data)
    ::File.write(@path, data)
  end

  # Returns the file content, or nil when nothing is stored at the path.
  #
  # The existence check and the read are two separate filesystem calls, so
  # the file may disappear in between (e.g. removed by another process).
  # That case is reported as "no data" (nil) rather than as Errno::ENOENT.
  #
  # @rbs () -> String?
  def read
    return nil unless exists?

    ::File.read(@path)
  rescue Errno::ENOENT
    nil
  end

  # Removes the file if it is present; does nothing otherwise.
  #
  # A file that vanishes between the existence check and the unlink is
  # treated the same as a file that was never there.
  #
  # @rbs () -> void
  def delete
    ::File.delete(@path) if exists?
  rescue Errno::ENOENT
    nil
  end

  # True when something exists at the configured path.
  #
  # @rbs () -> bool
  def exists?
    ::File.exist?(@path)
  end
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
|
5
|
+
# License:: LGPL
|
|
6
|
+
|
|
7
|
+
require_relative 'base'
|
|
8
|
+
|
|
9
|
+
module Classifier
|
|
10
|
+
module Storage
|
|
11
|
+
# In-memory storage for testing and ephemeral use.
|
|
12
|
+
#
|
|
13
|
+
# Example:
|
|
14
|
+
# bayes = Classifier::Bayes.new('Spam', 'Ham')
|
|
15
|
+
# bayes.storage = Classifier::Storage::Memory.new
|
|
16
|
+
# bayes.train_spam("Buy now!")
|
|
17
|
+
# bayes.save
|
|
18
|
+
#
|
|
19
|
+
class Memory < Base
  # @rbs @data: String?

  # Starts out empty: nothing is held until #write is called.
  # @rbs () -> void
  def initialize
    super
    @data = nil
  end

  # Keeps +data+ in the instance and returns it.
  # @rbs (String) -> String
  def write(data) = (@data = data)

  # Returns whatever was last written, or nil when nothing is held.
  # @rbs () -> String?
  def read = @data

  # Drops the held value.
  # @rbs () -> void
  def delete = (@data = nil)

  # True once a value has been written and not yet deleted.
  # @rbs () -> bool
  def exists? = !@data.nil?
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
module Classifier
|
|
4
|
+
module Streaming
|
|
5
|
+
# Memory-efficient line reader for large files and IO streams.
|
|
6
|
+
# Reads lines one at a time and can yield in configurable batches.
|
|
7
|
+
#
|
|
8
|
+
# @example Reading line by line
|
|
9
|
+
# reader = LineReader.new(File.open('large_corpus.txt'))
|
|
10
|
+
# reader.each { |line| process(line) }
|
|
11
|
+
#
|
|
12
|
+
# @example Reading in batches
|
|
13
|
+
# reader = LineReader.new(io, batch_size: 100)
|
|
14
|
+
# reader.each_batch { |batch| process_batch(batch) }
|
|
15
|
+
class LineReader
  include Enumerable #[String]

  # @rbs @io: IO
  # @rbs @batch_size: Integer

  # Number of lines grouped into each batch by #each_batch.
  attr_reader :batch_size

  # Creates a new LineReader around +io+.
  # +batch_size+ is only used by #each_batch.
  #
  # @rbs (IO, ?batch_size: Integer) -> void
  def initialize(io, batch_size: 100)
    @io = io
    @batch_size = batch_size
  end

  # Iterates over each line in the IO stream.
  # Lines are chomped (trailing newlines removed).
  # Returns an Enumerator when called without a block.
  #
  # @rbs () { (String) -> void } -> void
  # @rbs () -> Enumerator[String, void]
  def each
    return enum_for(:each) unless block_given?

    @io.each_line do |line|
      yield line.chomp
    end
  end

  # Iterates over batches of lines.
  # Each batch is an array of chomped lines holding up to +batch_size+
  # entries; the final batch may be shorter. Returns an Enumerator when
  # called without a block.
  #
  # @rbs () { (Array[String]) -> void } -> void
  # @rbs () -> Enumerator[Array[String], void]
  def each_batch
    return enum_for(:each_batch) unless block_given?

    batch = [] #: Array[String]
    each do |line|
      batch << line
      if batch.size >= @batch_size
        yield batch
        # Start a fresh array so the batch already handed out is not mutated.
        batch = []
      end
    end
    yield batch unless batch.empty?
  end

  # Estimates the total number of lines in the IO stream.
  # This is a rough estimate: the byte size of the stream divided by the
  # average byte length of the first +sample_size+ lines.
  #
  # Returns nil when the stream does not respond to +size+ and +rewind+,
  # when no line could be sampled, or when the stream raises IOError /
  # Errno::ESPIPE (e.g. it is closed or not seekable).
  #
  # The stream position is put back where it was before sampling, even
  # when sampling is interrupted by an error.
  #
  # @rbs (?sample_size: Integer) -> Integer?
  def estimate_line_count(sample_size: 100)
    return nil unless @io.respond_to?(:size) && @io.respond_to?(:rewind)

    original_pos = @io.pos
    sample_bytes = 0
    sample_lines = 0

    begin
      @io.rewind

      sample_size.times do
        line = @io.gets
        break unless line

        sample_bytes += line.bytesize
        sample_lines += 1
      end
    ensure
      # Restore on every exit path so a failed sample never leaves the
      # caller's stream at an unexpected offset.
      @io.seek(original_pos)
    end

    return nil if sample_lines.zero?

    avg_line_size = sample_bytes.to_f / sample_lines
    io_size = @io.__send__(:size) #: Integer
    (io_size / avg_line_size).round
  rescue IOError, Errno::ESPIPE
    nil
  end
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
module Classifier
|
|
4
|
+
module Streaming
|
|
5
|
+
# Progress tracking object yielded to blocks during batch/stream operations.
|
|
6
|
+
# Provides information about training progress including completion percentage,
|
|
7
|
+
# elapsed time, processing rate, and estimated time remaining.
|
|
8
|
+
#
|
|
9
|
+
# @example Basic usage with train_batch
|
|
10
|
+
# classifier.train_batch(:spam, documents, batch_size: 100) do |progress|
|
|
11
|
+
# puts "#{progress.completed}/#{progress.total} (#{progress.percent}%)"
|
|
12
|
+
# puts "Rate: #{progress.rate.round(1)} docs/sec"
|
|
13
|
+
# puts "ETA: #{progress.eta&.round}s" if progress.eta
|
|
14
|
+
# end
|
|
15
|
+
class Progress
  # @rbs @completed: Integer
  # @rbs @total: Integer?
  # @rbs @start_time: Time
  # @rbs @current_batch: Integer

  attr_reader :start_time, :total
  attr_accessor :completed, :current_batch

  # Starts the clock at construction time.
  # +total+ may be nil when the amount of work is not known up front.
  #
  # @rbs (?total: Integer?, ?completed: Integer) -> void
  def initialize(total: nil, completed: 0)
    @completed = completed
    @total = total
    @start_time = Time.now
    @current_batch = 0
  end

  # Returns the completion percentage, rounded to two decimals.
  # Returns nil if total is unknown or not positive.
  #
  # @rbs () -> Float?
  def percent
    return nil unless @total&.positive?

    (@completed.to_f / @total * 100).round(2)
  end

  # Returns the elapsed time in seconds since this object was created.
  #
  # @rbs () -> Float
  def elapsed
    Time.now - @start_time
  end

  # Returns the processing rate in items per second.
  # Returns 0.0 if no time has elapsed.
  #
  # @rbs () -> Float
  def rate
    e = elapsed
    return 0.0 if e.zero?

    @completed / e
  end

  # Returns the estimated time remaining in seconds.
  #
  # Returns nil if total is unknown, 0.0 once the completed count has
  # reached the total, and nil if work remains but the rate is zero (no
  # basis for an estimate).
  #
  # @rbs () -> Float?
  def eta
    return nil unless @total
    # Completion is checked before the rate so that a finished operation
    # always reports 0.0, even when the measured rate happens to be zero.
    return 0.0 if @completed >= @total

    # Sample the rate once: it depends on the clock, so two calls could
    # disagree.
    current_rate = rate
    return nil if current_rate.zero?

    (@total - @completed) / current_rate
  end

  # Returns true if the completed count has reached the total.
  # Always false when total is unknown.
  #
  # @rbs () -> bool
  def complete?
    return false unless @total

    @completed >= @total
  end

  # Returns a hash snapshot of the progress state; time-based values are
  # rounded to two decimals.
  #
  # @rbs () -> Hash[Symbol, untyped]
  def to_h
    {
      completed: @completed,
      total: @total,
      percent: percent,
      elapsed: elapsed.round(2),
      rate: rate.round(2),
      eta: eta&.round(2)
    }
  end
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
require_relative 'streaming/progress'
|
|
4
|
+
require_relative 'streaming/line_reader'
|
|
5
|
+
|
|
6
|
+
module Classifier
|
|
7
|
+
# Streaming module provides memory-efficient training capabilities for classifiers.
|
|
8
|
+
# Include this module in a classifier to add streaming and batch training methods.
|
|
9
|
+
#
|
|
10
|
+
# @example Including in a classifier
|
|
11
|
+
# class MyClassifier
|
|
12
|
+
# include Classifier::Streaming
|
|
13
|
+
# end
|
|
14
|
+
#
|
|
15
|
+
# @example Streaming training
|
|
16
|
+
# classifier.train_from_stream(:category, File.open('corpus.txt'))
|
|
17
|
+
#
|
|
18
|
+
# @example Batch training with progress
|
|
19
|
+
# classifier.train_batch(:category, documents, batch_size: 100) do |progress|
|
|
20
|
+
# puts "#{progress.percent}% complete"
|
|
21
|
+
# end
|
|
22
|
+
module Streaming
  # Default batch size for streaming operations
  DEFAULT_BATCH_SIZE = 100

  # Trains the classifier from an IO stream.
  # Each line in the stream is treated as a separate document.
  # This default only raises; the including class must override it.
  #
  # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
  def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
    raise NotImplementedError, "#{self.class} must implement train_from_stream"
  end

  # Trains the classifier with an array of documents in batches.
  # Supports both positional and keyword argument styles.
  # This default only raises; the including class must override it.
  #
  # @example Positional style
  #   classifier.train_batch(:spam, documents, batch_size: 100)
  #
  # @example Keyword style
  #   classifier.train_batch(spam: documents, ham: other_docs, batch_size: 100)
  #
  # @rbs (?(Symbol | String)?, ?Array[String]?, ?batch_size: Integer, **Array[String]) { (Progress) -> void } -> void
  def train_batch(category = nil, documents = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
    raise NotImplementedError, "#{self.class} must implement train_batch"
  end

  # Saves a checkpoint of the current training state.
  # Requires a storage backend to be configured.
  #
  # Temporarily points +storage+ at the checkpoint location, calls +save+,
  # and always puts the original storage back afterwards.
  # Raises ArgumentError when no storage is configured or the storage type
  # does not support checkpoints.
  #
  # @rbs (String) -> void
  def save_checkpoint(checkpoint_id)
    raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage

    original_storage = storage

    begin
      self.storage = checkpoint_storage_for(checkpoint_id)
      save
    ensure
      self.storage = original_storage
    end
  end

  # Lists available checkpoints.
  # Requires a storage backend to be configured.
  #
  # For file storage, returns the sorted ids of all entries next to the
  # storage file that are named "<base>_checkpoint_<id><ext>". Any other
  # storage type yields an empty list.
  #
  # Names are compared literally rather than through a glob pattern, so
  # storage paths containing glob metacharacters ("[", "{", "*", "?") or
  # backslashes still find their checkpoints.
  #
  # @rbs () -> Array[String]
  def list_checkpoints
    raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage

    case storage
    when Storage::File
      file_storage = storage #: Storage::File
      dir, prefix, ext = checkpoint_name_parts(file_storage.path)
      id_range_end = -(ext.length + 1)

      checkpoint_dir_entries(dir)
        .select { |name| checkpoint_file_name?(name, prefix, ext) }
        .map { |name| name[prefix.length..id_range_end] }
        .sort
    else
      []
    end
  end

  # Deletes a checkpoint.
  # Does nothing when the checkpoint does not exist.
  # Raises ArgumentError when no storage is configured or the storage type
  # does not support checkpoints.
  #
  # @rbs (String) -> void
  def delete_checkpoint(checkpoint_id)
    raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage

    checkpoint_storage = checkpoint_storage_for(checkpoint_id)
    checkpoint_storage.delete if checkpoint_storage.exists?
  end

  private

  # Builds the checkpoint file path for +checkpoint_id+ next to the
  # configured storage file.
  #
  # @rbs (String) -> String
  def checkpoint_path_for(checkpoint_id)
    raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)

    file_storage = storage #: Storage::File
    dir, prefix, ext = checkpoint_name_parts(file_storage.path)

    File.join(dir, "#{prefix}#{checkpoint_id}#{ext}")
  end

  # Returns a storage object pointing at the given checkpoint.
  # Only file storage is supported; anything else raises ArgumentError.
  #
  # @rbs (String) -> Storage::Base
  def checkpoint_storage_for(checkpoint_id)
    case storage
    when Storage::File
      Storage::File.new(path: checkpoint_path_for(checkpoint_id))
    else
      raise ArgumentError, "Checkpoints not supported for #{storage.class}"
    end
  end

  # Splits a storage path into the pieces every checkpoint name is built
  # from: the directory, the "<base>_checkpoint_" prefix and the extension.
  #
  # @rbs (String) -> [String, String, String]
  def checkpoint_name_parts(path)
    [File.dirname(path), "#{File.basename(path, '.*')}_checkpoint_", File.extname(path)]
  end

  # Entry names inside +dir+; empty when the directory is missing or
  # cannot be read (the same outcome a non-matching glob produced).
  #
  # @rbs (String) -> Array[String]
  def checkpoint_dir_entries(dir)
    Dir.children(dir)
  rescue SystemCallError
    []
  end

  # True when +name+ is "<prefix><id><ext>" for some (possibly empty) id.
  #
  # @rbs (String, String, String) -> bool
  def checkpoint_file_name?(name, prefix, ext)
    # The length guard rules out names where prefix and extension would
    # have to overlap.
    name.length >= prefix.length + ext.length && name.start_with?(prefix) && name.end_with?(ext)
  end
end
|
|
122
|
+
end
|