classifier 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +66 -199
- data/ext/classifier/classifier_ext.c +1 -0
- data/ext/classifier/incremental_svd.c +393 -0
- data/ext/classifier/linalg.h +8 -0
- data/lib/classifier/bayes.rb +177 -53
- data/lib/classifier/errors.rb +3 -0
- data/lib/classifier/knn.rb +351 -0
- data/lib/classifier/logistic_regression.rb +571 -0
- data/lib/classifier/lsi/incremental_svd.rb +166 -0
- data/lib/classifier/lsi/summary.rb +25 -5
- data/lib/classifier/lsi.rb +365 -17
- data/lib/classifier/streaming/line_reader.rb +99 -0
- data/lib/classifier/streaming/progress.rb +96 -0
- data/lib/classifier/streaming.rb +122 -0
- data/lib/classifier/tfidf.rb +408 -0
- data/lib/classifier.rb +4 -0
- data/sig/vendor/matrix.rbs +25 -14
- data/sig/vendor/streaming.rbs +14 -0
- metadata +17 -4
data/lib/classifier/streaming.rb
ADDED
@@ -0,0 +1,122 @@
+# rbs_inline: enabled
+
+require_relative 'streaming/progress'
+require_relative 'streaming/line_reader'
+
+module Classifier
+  # Streaming module provides memory-efficient training capabilities for classifiers.
+  # Include this module in a classifier to add streaming and batch training methods.
+  #
+  # @example Including in a classifier
+  #   class MyClassifier
+  #     include Classifier::Streaming
+  #   end
+  #
+  # @example Streaming training
+  #   classifier.train_from_stream(:category, File.open('corpus.txt'))
+  #
+  # @example Batch training with progress
+  #   classifier.train_batch(:category, documents, batch_size: 100) do |progress|
+  #     puts "#{progress.percent}% complete"
+  #   end
+  module Streaming
+    # Default batch size for streaming operations
+    DEFAULT_BATCH_SIZE = 100
+
+    # Trains the classifier from an IO stream.
+    # Each line in the stream is treated as a separate document.
+    #
+    # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
+    def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
+      raise NotImplementedError, "#{self.class} must implement train_from_stream"
+    end
+
+    # Trains the classifier with an array of documents in batches.
+    # Supports both positional and keyword argument styles.
+    #
+    # @example Positional style
+    #   classifier.train_batch(:spam, documents, batch_size: 100)
+    #
+    # @example Keyword style
+    #   classifier.train_batch(spam: documents, ham: other_docs, batch_size: 100)
+    #
+    # @rbs (?(Symbol | String)?, ?Array[String]?, ?batch_size: Integer, **Array[String]) { (Progress) -> void } -> void
+    def train_batch(category = nil, documents = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
+      raise NotImplementedError, "#{self.class} must implement train_batch"
+    end
+
+    # Saves a checkpoint of the current training state.
+    # Requires a storage backend to be configured.
+    #
+    # @rbs (String) -> void
+    def save_checkpoint(checkpoint_id)
+      raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
+
+      original_storage = storage
+
+      begin
+        self.storage = checkpoint_storage_for(checkpoint_id)
+        save
+      ensure
+        self.storage = original_storage
+      end
+    end
+
+    # Lists available checkpoints.
+    # Requires a storage backend to be configured.
+    #
+    # @rbs () -> Array[String]
+    def list_checkpoints
+      raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
+
+      case storage
+      when Storage::File
+        file_storage = storage #: Storage::File
+        dir = File.dirname(file_storage.path)
+        base = File.basename(file_storage.path, '.*')
+        ext = File.extname(file_storage.path)
+
+        pattern = File.join(dir, "#{base}_checkpoint_*#{ext}")
+        Dir.glob(pattern).map do |path|
+          File.basename(path, ext).sub(/^#{Regexp.escape(base)}_checkpoint_/, '')
+        end.sort
+      else
+        []
+      end
+    end
+
+    # Deletes a checkpoint.
+    #
+    # @rbs (String) -> void
+    def delete_checkpoint(checkpoint_id)
+      raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
+
+      checkpoint_storage = checkpoint_storage_for(checkpoint_id)
+      checkpoint_storage.delete if checkpoint_storage.exists?
+    end
+
+    private
+
+    # @rbs (String) -> String
+    def checkpoint_path_for(checkpoint_id)
+      raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)
+
+      file_storage = storage #: Storage::File
+      dir = File.dirname(file_storage.path)
+      base = File.basename(file_storage.path, '.*')
+      ext = File.extname(file_storage.path)
+
+      File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
+    end
+
+    # @rbs (String) -> Storage::Base
+    def checkpoint_storage_for(checkpoint_id)
+      case storage
+      when Storage::File
+        Storage::File.new(path: checkpoint_path_for(checkpoint_id))
+      else
+        raise ArgumentError, "Checkpoints not supported for #{storage.class}"
+      end
+    end
+  end
+end
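To make the new API concrete, here is a minimal usage sketch (not part of the diff). It assumes a classifier that mixes in Classifier::Streaming and exposes the storage accessor and #save that the checkpoint helpers above rely on; whether Bayes gains these methods in 2.2.0 is an assumption, and the file name and spam_documents variable are illustrative only.

require 'classifier'

# Hypothetical host: assumes Bayes mixes in Classifier::Streaming in 2.2.0
# and accepts a Storage::File backend via the storage= accessor.
classifier = Classifier::Bayes.new('Spam', 'Ham')
classifier.storage = Classifier::Storage::File.new(path: 'model.json')

# Batch training with progress reporting, per the @example above.
classifier.train_batch(:spam, spam_documents, batch_size: 100) do |progress|
  puts "#{progress.percent}% complete"
end

# Checkpoints are written next to the main file as model_checkpoint_<id>.json.
classifier.save_checkpoint('epoch1')
classifier.list_checkpoints   # => ["epoch1"]
classifier.delete_checkpoint('epoch1')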
data/lib/classifier/tfidf.rb
ADDED
@@ -0,0 +1,408 @@
+# rbs_inline: enabled
+
+# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2024 Lucas Carlson
+# License::   LGPL
+
+require 'json'
+
+module Classifier
+  # TF-IDF vectorizer: transforms text to weighted feature vectors.
+  # Downweights common words, upweights discriminative terms.
+  #
+  # Example:
+  #   tfidf = Classifier::TFIDF.new
+  #   tfidf.fit(["Dogs are great pets", "Cats are independent"])
+  #   tfidf.transform("Dogs are loyal") # => {:dog=>0.7071..., :loyal=>0.7071...}
+  #
+  class TFIDF
+    include Streaming
+
+    # @rbs @min_df: Integer | Float
+    # @rbs @max_df: Integer | Float
+    # @rbs @ngram_range: Array[Integer]
+    # @rbs @sublinear_tf: bool
+    # @rbs @vocabulary: Hash[Symbol, Integer]
+    # @rbs @idf: Hash[Symbol, Float]
+    # @rbs @num_documents: Integer
+    # @rbs @fitted: bool
+    # @rbs @dirty: bool
+    # @rbs @storage: Storage::Base?
+
+    attr_reader :vocabulary, :idf, :num_documents
+    attr_accessor :storage
+
+    # Creates a new TF-IDF vectorizer.
+    # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
+    # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
+    # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+    #
+    # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
+    #      ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
+    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+      validate_df!(min_df, 'min_df')
+      validate_df!(max_df, 'max_df')
+      validate_ngram_range!(ngram_range)
+
+      @min_df = min_df
+      @max_df = max_df
+      @ngram_range = ngram_range
+      @sublinear_tf = sublinear_tf
+      @vocabulary = {}
+      @idf = {}
+      @num_documents = 0
+      @fitted = false
+      @dirty = false
+      @storage = nil
+    end
+
+    # Learns vocabulary and IDF weights from the corpus.
+    # @rbs (Array[String]) -> self
+    def fit(documents)
+      raise ArgumentError, 'documents must be an array' unless documents.is_a?(Array)
+      raise ArgumentError, 'documents cannot be empty' if documents.empty?
+
+      @num_documents = documents.size
+      document_frequencies = Hash.new(0)
+
+      documents.each do |doc|
+        terms = extract_terms(doc)
+        terms.each_key { |term| document_frequencies[term] += 1 }
+      end
+
+      @vocabulary = {}
+      @idf = {}
+      vocab_index = 0
+
+      document_frequencies.each do |term, df|
+        next unless within_df_bounds?(df, @num_documents)
+
+        @vocabulary[term] = vocab_index
+        vocab_index += 1
+
+        # IDF: log((N + 1) / (df + 1)) + 1
+        @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1
+      end
+
+      @fitted = true
+      @dirty = true
+      self
+    end
+
+    # Transforms a document into a normalized TF-IDF vector.
+    # @rbs (String) -> Hash[Symbol, Float]
+    def transform(document)
+      raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted
+
+      terms = extract_terms(document)
+      result = {} #: Hash[Symbol, Float]
+
+      terms.each do |term, tf|
+        next unless @vocabulary.key?(term)
+
+        tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f
+        result[term] = (tf_value * @idf[term]).to_f
+      end
+
+      normalize_vector(result)
+    end
+
+    # Fits and transforms in one step.
+    # @rbs (Array[String]) -> Array[Hash[Symbol, Float]]
+    def fit_transform(documents)
+      fit(documents)
+      documents.map { |doc| transform(doc) }
+    end
+
+    # Returns vocabulary terms in index order.
+    # @rbs () -> Array[Symbol]
+    def feature_names
+      @vocabulary.keys.sort_by { |term| @vocabulary[term] }
+    end
+
+    # @rbs () -> bool
+    def fitted?
+      @fitted
+    end
+
+    # Returns true if there are unsaved changes.
+    # @rbs () -> bool
+    def dirty?
+      @dirty
+    end
+
+    # Saves the vectorizer to the configured storage.
+    # @rbs () -> void
+    def save
+      raise ArgumentError, 'No storage configured' unless storage
+
+      storage.write(to_json)
+      @dirty = false
+    end
+
+    # Saves the vectorizer state to a file.
+    # @rbs (String) -> Integer
+    def save_to_file(path)
+      result = File.write(path, to_json)
+      @dirty = false
+      result
+    end
+
+    # Loads a vectorizer from the configured storage.
+    # @rbs (storage: Storage::Base) -> TFIDF
+    def self.load(storage:)
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+
+      instance = from_json(data)
+      instance.storage = storage
+      instance
+    end
+
+    # Loads a vectorizer from a file.
+    # @rbs (String) -> TFIDF
+    def self.load_from_file(path)
+      from_json(File.read(path))
+    end
+
+    # Reloads the vectorizer from storage, raising if there are unsaved changes.
+    # @rbs () -> self
+    def reload
+      raise ArgumentError, 'No storage configured' unless storage
+      raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
+
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+
+      restore_from_json(data)
+      @dirty = false
+      self
+    end
+
+    # Force reloads the vectorizer from storage, discarding any unsaved changes.
+    # @rbs () -> self
+    def reload!
+      raise ArgumentError, 'No storage configured' unless storage
+
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+
+      restore_from_json(data)
+      @dirty = false
+      self
+    end
+
+    # @rbs (?untyped) -> Hash[Symbol, untyped]
+    def as_json(_options = nil)
+      {
+        version: 1,
+        type: 'tfidf',
+        min_df: @min_df,
+        max_df: @max_df,
+        ngram_range: @ngram_range,
+        sublinear_tf: @sublinear_tf,
+        vocabulary: @vocabulary,
+        idf: @idf,
+        num_documents: @num_documents,
+        fitted: @fitted
+      }
+    end
+
+    # @rbs (?untyped) -> String
+    def to_json(_options = nil)
+      JSON.generate(as_json)
+    end
+
+    # Loads a vectorizer from JSON.
+    # @rbs (String | Hash[String, untyped]) -> TFIDF
+    def self.from_json(json)
+      data = json.is_a?(String) ? JSON.parse(json) : json
+      raise ArgumentError, "Invalid vectorizer type: #{data['type']}" unless data['type'] == 'tfidf'
+
+      instance = new(
+        min_df: data['min_df'],
+        max_df: data['max_df'],
+        ngram_range: data['ngram_range'],
+        sublinear_tf: data['sublinear_tf']
+      )
+
+      instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
+      instance.instance_variable_set(:@idf, symbolize_keys(data['idf']))
+      instance.instance_variable_set(:@num_documents, data['num_documents'])
+      instance.instance_variable_set(:@fitted, data['fitted'])
+      instance.instance_variable_set(:@dirty, false)
+      instance.instance_variable_set(:@storage, nil)
+
+      instance
+    end
+
+    # @rbs () -> Array[untyped]
+    def marshal_dump
+      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+    end
+
+    # @rbs (Array[untyped]) -> void
+    def marshal_load(data)
+      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+      @dirty = false
+      @storage = nil
+    end
+
+    # Loads a vectorizer from a checkpoint.
+    #
+    # @rbs (storage: Storage::Base, checkpoint_id: String) -> TFIDF
+    def self.load_checkpoint(storage:, checkpoint_id:)
+      raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)
+
+      dir = File.dirname(storage.path)
+      base = File.basename(storage.path, '.*')
+      ext = File.extname(storage.path)
+      checkpoint_path = File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
+
+      checkpoint_storage = Storage::File.new(path: checkpoint_path)
+      instance = load(storage: checkpoint_storage)
+      instance.storage = storage
+      instance
+    end
+
+    # Fits the vectorizer from an IO stream.
+    # Collects all documents from the stream, then fits the model.
+    # Note: All documents must be collected in memory for IDF calculation.
+    #
+    # @example Fit from a file
+    #   tfidf.fit_from_stream(File.open('corpus.txt'))
+    #
+    # @example With progress tracking
+    #   tfidf.fit_from_stream(io, batch_size: 500) do |progress|
+    #     puts "#{progress.completed} documents loaded"
+    #   end
+    #
+    # @rbs (IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> self
+    def fit_from_stream(io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
+      reader = Streaming::LineReader.new(io, batch_size: batch_size)
+      total = reader.estimate_line_count
+      progress = Streaming::Progress.new(total: total)
+
+      documents = [] #: Array[String]
+
+      reader.each_batch do |batch|
+        documents.concat(batch)
+        progress.completed += batch.size
+        progress.current_batch += 1
+        yield progress if block_given?
+      end
+
+      fit(documents) unless documents.empty?
+      self
+    end
+
+    # TFIDF doesn't support train_from_stream (use fit_from_stream instead).
+    # This method raises NotImplementedError with guidance.
+    #
+    # @rbs (*untyped, **untyped) -> void
+    def train_from_stream(*) # steep:ignore
+      raise NotImplementedError, 'TFIDF uses fit_from_stream instead of train_from_stream'
+    end
+
+    # TFIDF doesn't support train_batch (use fit instead).
+    # This method raises NotImplementedError with guidance.
+    #
+    # @rbs (*untyped, **untyped) -> void
+    def train_batch(*) # steep:ignore
+      raise NotImplementedError, 'TFIDF uses fit instead of train_batch'
+    end
+
+    private
+
+    # Restores vectorizer state from JSON string.
+    # @rbs (String) -> void
+    def restore_from_json(json)
+      data = JSON.parse(json)
+
+      @min_df = data['min_df']
+      @max_df = data['max_df']
+      @ngram_range = data['ngram_range']
+      @sublinear_tf = data['sublinear_tf']
+      @vocabulary = self.class.send(:symbolize_keys, data['vocabulary'])
+      @idf = self.class.send(:symbolize_keys, data['idf'])
+      @num_documents = data['num_documents']
+      @fitted = data['fitted']
+    end
+
+    # @rbs (String) -> Hash[Symbol, Integer]
+    def extract_terms(document)
+      result = Hash.new(0)
+
+      if @ngram_range[0] <= 1
+        word_hash = document.clean_word_hash
+        word_hash.each { |term, count| result[term] += count }
+      end
+
+      return result if @ngram_range[1] <= 1
+
+      tokens = tokenize_for_ngrams(document)
+      (2..@ngram_range[1]).each do |n|
+        next if n < @ngram_range[0]
+
+        generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
+      end
+
+      result
+    end
+
+    # @rbs (String) -> Array[String]
+    def tokenize_for_ngrams(document)
+      document
+        .gsub(/[^\w\s]/, '')
+        .split
+        .map(&:downcase)
+        .reject { |w| w.length <= 2 || String::CORPUS_SKIP_WORDS.include?(w) }
+        .map(&:stem)
+    end
+
+    # @rbs (Array[String], Integer) -> Array[Symbol]
+    def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName
+      return [] if tokens.size < n
+
+      tokens.each_cons(n).map { |gram| gram.join('_').intern }
+    end
+
+    # @rbs (Integer, Integer) -> bool
+    def within_df_bounds?(doc_freq, num_docs)
+      doc_freq.between?(
+        @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df,
+        @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
+      )
+    end
+
+    # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float]
+    def normalize_vector(vector)
+      return vector if vector.empty?
+
+      magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+      return vector if magnitude.zero?
+
+      vector.transform_values { |v| v / magnitude }
+    end
+
+    # @rbs (Integer | Float, String) -> void
+    def validate_df!(value, name)
+      raise ArgumentError, "#{name} must be an Integer or Float" unless value.is_a?(Float) || value.is_a?(Integer)
+      raise ArgumentError, "#{name} must be between 0.0 and 1.0" if value.is_a?(Float) && !value.between?(0.0, 1.0)
+      raise ArgumentError, "#{name} must be non-negative" if value.is_a?(Integer) && value.negative?
+    end
+
+    # @rbs (Array[Integer]) -> void
+    def validate_ngram_range!(range)
+      raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
+      raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?)
+      raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
+    end
+
+    # @rbs (Hash[String, untyped]) -> Hash[Symbol, untyped]
+    def self.symbolize_keys(hash)
+      hash.transform_keys(&:to_sym)
+    end
+    private_class_method :symbolize_keys
+  end
+end
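As a quick illustration of the API above (not part of the diff), the sketch below fits a toy corpus, checks the smoothed IDF weighting used in #fit, and confirms that #transform returns an L2-normalized vector. The corpus strings and the corpus.txt path are made up.

tfidf = Classifier::TFIDF.new(ngram_range: [1, 2], sublinear_tf: true)
docs = [
  'Dogs are great pets',
  'Cats are independent pets',
  'Dogs are loyal'
]
tfidf.fit(docs)

# Smoothed IDF from #fit: log((N + 1) / (df + 1)) + 1.
# With N = 3 documents and a term appearing in df = 2 of them:
Math.log((3 + 1) / (2 + 1).to_f) + 1    # => 1.2876820724517809

vec = tfidf.transform('Dogs are loyal pets')
# #transform applies tf (or 1 + log(tf)) * idf, then L2-normalizes,
# so the squared weights sum to 1.0 within floating-point error.
vec.values.sum { |v| v * v }            # => ~1.0

tfidf.feature_names.first(5)            # vocabulary terms in index order

# Streaming fit (lines are still buffered in memory for the IDF pass, per the note above):
File.open('corpus.txt') do |io|
  tfidf.fit_from_stream(io, batch_size: 500) { |p| puts "#{p.completed} documents loaded" }
end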
data/lib/classifier.rb
CHANGED
@@ -27,7 +27,11 @@
 require 'rubygems'
 require 'classifier/errors'
 require 'classifier/storage'
+require 'classifier/streaming'
 require 'classifier/extensions/string'
 require 'classifier/extensions/vector'
 require 'classifier/bayes'
 require 'classifier/lsi'
+require 'classifier/knn'
+require 'classifier/tfidf'
+require 'classifier/logistic_regression'
data/sig/vendor/matrix.rbs
CHANGED
@@ -1,26 +1,37 @@
 # Type stubs for matrix gem
-
+# Using untyped elements since our usage is primarily with Floats/Numerics
+class Vector
   EPSILON: Float
 
-  def self.[]:
+  def self.[]: (*untyped) -> Vector
   def size: () -> Integer
-  def []: (Integer) ->
+  def []: (Integer) -> untyped
   def magnitude: () -> Float
-  def normalize: () -> Vector
-  def each: () { (
-  def collect:
-  def to_a: () -> Array[
+  def normalize: () -> Vector
+  def each: () { (untyped) -> void } -> void
+  def collect: () { (untyped) -> untyped } -> Vector
+  def to_a: () -> Array[untyped]
   def *: (untyped) -> untyped
+  def -: (Vector) -> Vector
+  def is_a?: (untyped) -> bool
 end
 
-class Matrix
-  def self.rows:
-  def self.[]:
-  def self.diag: (untyped) -> Matrix
-  def
+class Matrix
+  def self.rows: (Array[Array[untyped]]) -> Matrix
+  def self.[]: (*Array[untyped]) -> Matrix
+  def self.diag: (untyped) -> Matrix
+  def self.columns: (Array[Array[untyped]]) -> Matrix
+  def self.empty: (Integer, Integer) -> Matrix
+  def self.zero: (Integer, Integer) -> Matrix
+  def self.vstack: (Matrix, Matrix) -> Matrix
+  def trans: () -> Matrix
+  def transpose: () -> Matrix
   def *: (untyped) -> untyped
   def row_size: () -> Integer
   def column_size: () -> Integer
-  def
-  def
+  def row: (Integer) -> Vector
+  def column: (Integer) -> Vector
+  def SV_decomp: () -> [Matrix, Matrix, untyped]
+  def is_a?: (untyped) -> bool
+  def respond_to?: (Symbol) -> bool
 end
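The widened stubs above largely mirror Ruby's stdlib matrix gem; the short check below (an illustration, not from the diff) exercises only the stdlib methods, since SV_decomp and trans are presumably provided by the gem's own vector extension or GSL and are not shown here.

require 'matrix'

m = Matrix.rows([[1.0, 2.0], [3.0, 4.0]])
m.transpose.row_size   # => 2
m.row(0)               # => Vector[1.0, 2.0]

v = Vector[3.0, 4.0]
v.magnitude            # => 5.0
v.normalize.to_a       # => [0.6, 0.8]
(m * v).to_a           # => [11.0, 25.0]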
data/sig/vendor/streaming.rbs
ADDED
@@ -0,0 +1,14 @@
+# Type stubs for Streaming module
+# Defines the interface that including classes must implement
+
+module Classifier
+  # Interface for classes that include Streaming
+  interface _StreamingHost
+    def storage: () -> Storage::Base?
+    def storage=: (Storage::Base?) -> void
+    def save: () -> void
+  end
+
+  module Streaming : _StreamingHost
+  end
+end
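For context, a minimal sketch (not from the diff) of a class that satisfies the _StreamingHost interface above before mixing in Classifier::Streaming; the class name and serialized payload are placeholders.

module Classifier
  # Hypothetical host: provides the storage accessors and #save that
  # _StreamingHost requires, so the Streaming mixin can checkpoint it.
  class MyVectorizer
    include Streaming

    attr_accessor :storage #: Storage::Base?

    def save
      raise ArgumentError, 'No storage configured' unless storage

      storage.write('{}') # real implementations serialize their state here
    end
  end
end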
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: classifier
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 2.2.0
 platform: ruby
 authors:
 - Lucas Carlson
@@ -121,7 +121,10 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-description: A
+description: A Ruby library for text classification featuring Naive Bayes, LSI (Latent
+  Semantic Indexing), Logistic Regression, and k-Nearest Neighbors classifiers. Includes
+  TF-IDF vectorization, streaming/incremental training, pluggable persistence backends,
+  thread safety, and a native C extension for fast LSI operations.
 email: lucas@rufy.com
 executables: []
 extensions:
@@ -135,6 +138,7 @@ files:
 - bin/summarize.rb
 - ext/classifier/classifier_ext.c
 - ext/classifier/extconf.rb
+- ext/classifier/incremental_svd.c
 - ext/classifier/linalg.h
 - ext/classifier/matrix.c
 - ext/classifier/svd.c
@@ -145,19 +149,27 @@ files:
 - lib/classifier/extensions/string.rb
 - lib/classifier/extensions/vector.rb
 - lib/classifier/extensions/word_hash.rb
+- lib/classifier/knn.rb
+- lib/classifier/logistic_regression.rb
 - lib/classifier/lsi.rb
 - lib/classifier/lsi/content_node.rb
+- lib/classifier/lsi/incremental_svd.rb
 - lib/classifier/lsi/summary.rb
 - lib/classifier/lsi/word_list.rb
 - lib/classifier/storage.rb
 - lib/classifier/storage/base.rb
 - lib/classifier/storage/file.rb
 - lib/classifier/storage/memory.rb
+- lib/classifier/streaming.rb
+- lib/classifier/streaming/line_reader.rb
+- lib/classifier/streaming/progress.rb
+- lib/classifier/tfidf.rb
 - sig/vendor/fast_stemmer.rbs
 - sig/vendor/gsl.rbs
 - sig/vendor/json.rbs
 - sig/vendor/matrix.rbs
 - sig/vendor/mutex_m.rbs
+- sig/vendor/streaming.rbs
 - test/test_helper.rb
 homepage: https://rubyclassifier.com
 licenses:
@@ -174,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '
+      version: '3.1'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -183,5 +195,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubygems_version: 4.0.3
 specification_version: 4
-summary:
+summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
+  vectorization.
 test_files: []