classifier 2.1.0 → 2.2.0

data/lib/classifier/streaming.rb ADDED
@@ -0,0 +1,122 @@
+ # rbs_inline: enabled
+
+ require_relative 'streaming/progress'
+ require_relative 'streaming/line_reader'
+
+ module Classifier
+   # Streaming module provides memory-efficient training capabilities for classifiers.
+   # Include this module in a classifier to add streaming and batch training methods.
+   #
+   # @example Including in a classifier
+   #   class MyClassifier
+   #     include Classifier::Streaming
+   #   end
+   #
+   # @example Streaming training
+   #   classifier.train_from_stream(:category, File.open('corpus.txt'))
+   #
+   # @example Batch training with progress
+   #   classifier.train_batch(:category, documents, batch_size: 100) do |progress|
+   #     puts "#{progress.percent}% complete"
+   #   end
+   module Streaming
+     # Default batch size for streaming operations
+     DEFAULT_BATCH_SIZE = 100
+
+     # Trains the classifier from an IO stream.
+     # Each line in the stream is treated as a separate document.
+     #
+     # @rbs (Symbol | String, IO, ?batch_size: Integer) { (Progress) -> void } -> void
+     def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
+       raise NotImplementedError, "#{self.class} must implement train_from_stream"
+     end
+
+     # Trains the classifier with an array of documents in batches.
+     # Supports both positional and keyword argument styles.
+     #
+     # @example Positional style
+     #   classifier.train_batch(:spam, documents, batch_size: 100)
+     #
+     # @example Keyword style
+     #   classifier.train_batch(spam: documents, ham: other_docs, batch_size: 100)
+     #
+     # @rbs (?(Symbol | String)?, ?Array[String]?, ?batch_size: Integer, **Array[String]) { (Progress) -> void } -> void
+     def train_batch(category = nil, documents = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
+       raise NotImplementedError, "#{self.class} must implement train_batch"
+     end
+
+     # Saves a checkpoint of the current training state.
+     # Requires a storage backend to be configured.
+     #
+     # @rbs (String) -> void
+     def save_checkpoint(checkpoint_id)
+       raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
+
+       original_storage = storage
+
+       begin
+         self.storage = checkpoint_storage_for(checkpoint_id)
+         save
+       ensure
+         self.storage = original_storage
+       end
+     end
+
+     # Lists available checkpoints.
+     # Requires a storage backend to be configured.
+     #
+     # @rbs () -> Array[String]
+     def list_checkpoints
+       raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
+
+       case storage
+       when Storage::File
+         file_storage = storage #: Storage::File
+         dir = File.dirname(file_storage.path)
+         base = File.basename(file_storage.path, '.*')
+         ext = File.extname(file_storage.path)
+
+         pattern = File.join(dir, "#{base}_checkpoint_*#{ext}")
+         Dir.glob(pattern).map do |path|
+           File.basename(path, ext).sub(/^#{Regexp.escape(base)}_checkpoint_/, '')
+         end.sort
+       else
+         []
+       end
+     end
+
+     # Deletes a checkpoint.
+     #
+     # @rbs (String) -> void
+     def delete_checkpoint(checkpoint_id)
+       raise ArgumentError, 'No storage configured' unless respond_to?(:storage) && storage
+
+       checkpoint_storage = checkpoint_storage_for(checkpoint_id)
+       checkpoint_storage.delete if checkpoint_storage.exists?
+     end
+
+     private
+
+     # @rbs (String) -> String
+     def checkpoint_path_for(checkpoint_id)
+       raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)
+
+       file_storage = storage #: Storage::File
+       dir = File.dirname(file_storage.path)
+       base = File.basename(file_storage.path, '.*')
+       ext = File.extname(file_storage.path)
+
+       File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
+     end
+
+     # @rbs (String) -> Storage::Base
+     def checkpoint_storage_for(checkpoint_id)
+       case storage
+       when Storage::File
+         Storage::File.new(path: checkpoint_path_for(checkpoint_id))
+       else
+         raise ArgumentError, "Checkpoints not supported for #{storage.class}"
+       end
+     end
+   end
+ end
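
A rough usage sketch of the checkpoint helpers above, assuming a host classifier that includes Classifier::Streaming, exposes the storage accessor, and uses the gem's Storage::File backend; the class, serialization, and paths here are illustrative, not taken from this release:

    require 'classifier'

    # Hypothetical host: any class satisfying the _StreamingHost interface
    # (storage reader/writer plus #save) gains the checkpoint methods.
    class TrainableModel
      include Classifier::Streaming
      attr_accessor :storage

      def save
        storage.write('serialized model state') # placeholder serialization
      end
    end

    model = TrainableModel.new
    model.storage = Classifier::Storage::File.new(path: 'models/model.json')

    model.save_checkpoint('epoch1')   # writes models/model_checkpoint_epoch1.json
    model.list_checkpoints            # => ["epoch1"]
    model.delete_checkpoint('epoch1') # removes the checkpoint file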
data/lib/classifier/tfidf.rb ADDED
@@ -0,0 +1,408 @@
+ # rbs_inline: enabled
+
+ # Author:: Lucas Carlson  (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2024 Lucas Carlson
+ # License:: LGPL
+
+ require 'json'
+
+ module Classifier
+   # TF-IDF vectorizer: transforms text to weighted feature vectors.
+   # Downweights common words, upweights discriminative terms.
+   #
+   # Example:
+   #   tfidf = Classifier::TFIDF.new
+   #   tfidf.fit(["Dogs are great pets", "Cats are independent"])
+   #   tfidf.transform("Dogs are loyal") # => {:dog=>0.7071..., :loyal=>0.7071...}
+   #
+   class TFIDF
+     include Streaming
+
+     # @rbs @min_df: Integer | Float
+     # @rbs @max_df: Integer | Float
+     # @rbs @ngram_range: Array[Integer]
+     # @rbs @sublinear_tf: bool
+     # @rbs @vocabulary: Hash[Symbol, Integer]
+     # @rbs @idf: Hash[Symbol, Float]
+     # @rbs @num_documents: Integer
+     # @rbs @fitted: bool
+     # @rbs @dirty: bool
+     # @rbs @storage: Storage::Base?
+
+     attr_reader :vocabulary, :idf, :num_documents
+     attr_accessor :storage
+
+     # Creates a new TF-IDF vectorizer.
+     # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
+     # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
+     # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+     #
+     # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
+     #       ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
+     def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+       validate_df!(min_df, 'min_df')
+       validate_df!(max_df, 'max_df')
+       validate_ngram_range!(ngram_range)
+
+       @min_df = min_df
+       @max_df = max_df
+       @ngram_range = ngram_range
+       @sublinear_tf = sublinear_tf
+       @vocabulary = {}
+       @idf = {}
+       @num_documents = 0
+       @fitted = false
+       @dirty = false
+       @storage = nil
+     end
+
+     # Learns vocabulary and IDF weights from the corpus.
+     # @rbs (Array[String]) -> self
+     def fit(documents)
+       raise ArgumentError, 'documents must be an array' unless documents.is_a?(Array)
+       raise ArgumentError, 'documents cannot be empty' if documents.empty?
+
+       @num_documents = documents.size
+       document_frequencies = Hash.new(0)
+
+       documents.each do |doc|
+         terms = extract_terms(doc)
+         terms.each_key { |term| document_frequencies[term] += 1 }
+       end
+
+       @vocabulary = {}
+       @idf = {}
+       vocab_index = 0
+
+       document_frequencies.each do |term, df|
+         next unless within_df_bounds?(df, @num_documents)
+
+         @vocabulary[term] = vocab_index
+         vocab_index += 1
+
+         # IDF: log((N + 1) / (df + 1)) + 1
+         @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1
+       end
+
+       @fitted = true
+       @dirty = true
+       self
+     end
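
The smoothed IDF above never reaches zero and can be checked by hand; an illustrative calculation with made-up counts:

    # A term seen in 2 of 4 documents: log((4 + 1) / (2 + 1)) + 1
    Math.log(5.0 / 3) + 1 # => ~1.51
    # A term seen in every document still keeps a positive weight:
    Math.log(5.0 / 5) + 1 # => 1.0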
+
+     # Transforms a document into a normalized TF-IDF vector.
+     # @rbs (String) -> Hash[Symbol, Float]
+     def transform(document)
+       raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted
+
+       terms = extract_terms(document)
+       result = {} #: Hash[Symbol, Float]
+
+       terms.each do |term, tf|
+         next unless @vocabulary.key?(term)
+
+         tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f
+         result[term] = (tf_value * @idf[term]).to_f
+       end
+
+       normalize_vector(result)
+     end
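
With sublinear_tf enabled, repeated occurrences are damped before the IDF weight and normalization are applied; an illustrative comparison for a raw count of 3:

    tf = 3
    1 + Math.log(tf) # => ~2.10 (sublinear), versus 3.0 with raw term frequency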
+
+     # Fits and transforms in one step.
+     # @rbs (Array[String]) -> Array[Hash[Symbol, Float]]
+     def fit_transform(documents)
+       fit(documents)
+       documents.map { |doc| transform(doc) }
+     end
+
+     # Returns vocabulary terms in index order.
+     # @rbs () -> Array[Symbol]
+     def feature_names
+       @vocabulary.keys.sort_by { |term| @vocabulary[term] }
+     end
+
+     # @rbs () -> bool
+     def fitted?
+       @fitted
+     end
+
+     # Returns true if there are unsaved changes.
+     # @rbs () -> bool
+     def dirty?
+       @dirty
+     end
+
+     # Saves the vectorizer to the configured storage.
+     # @rbs () -> void
+     def save
+       raise ArgumentError, 'No storage configured' unless storage
+
+       storage.write(to_json)
+       @dirty = false
+     end
+
+     # Saves the vectorizer state to a file.
+     # @rbs (String) -> Integer
+     def save_to_file(path)
+       result = File.write(path, to_json)
+       @dirty = false
+       result
+     end
+
+     # Loads a vectorizer from the configured storage.
+     # @rbs (storage: Storage::Base) -> TFIDF
+     def self.load(storage:)
+       data = storage.read
+       raise StorageError, 'No saved state found' unless data
+
+       instance = from_json(data)
+       instance.storage = storage
+       instance
+     end
+
+     # Loads a vectorizer from a file.
+     # @rbs (String) -> TFIDF
+     def self.load_from_file(path)
+       from_json(File.read(path))
+     end
+
+     # Reloads the vectorizer from storage, raising if there are unsaved changes.
+     # @rbs () -> self
+     def reload
+       raise ArgumentError, 'No storage configured' unless storage
+       raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
+
+       data = storage.read
+       raise StorageError, 'No saved state found' unless data
+
+       restore_from_json(data)
+       @dirty = false
+       self
+     end
+
+     # Force reloads the vectorizer from storage, discarding any unsaved changes.
+     # @rbs () -> self
+     def reload!
+       raise ArgumentError, 'No storage configured' unless storage
+
+       data = storage.read
+       raise StorageError, 'No saved state found' unless data
+
+       restore_from_json(data)
+       @dirty = false
+       self
+     end
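
A save/reload sketch for the persistence API above, assuming a writable local path (file name illustrative) and that UnsavedChangesError lives under the Classifier namespace via classifier/errors:

    tfidf = Classifier::TFIDF.new
    tfidf.storage = Classifier::Storage::File.new(path: 'tfidf.json')

    tfidf.fit(['Dogs are great pets', 'Cats are independent'])
    tfidf.dirty? # => true, the fitted state has not been written yet
    tfidf.save   # serializes to tfidf.json via the storage backend
    tfidf.dirty? # => false

    tfidf.fit(['A different corpus entirely']) # in-memory state now diverges
    begin
      tfidf.reload  # refuses: unsaved changes would be lost
    rescue Classifier::UnsavedChangesError
      tfidf.reload! # explicitly discard them and restore the saved state
    end

    restored = Classifier::TFIDF.load(storage: tfidf.storage)
    restored.fitted? # => true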
+
+     # @rbs (?untyped) -> Hash[Symbol, untyped]
+     def as_json(_options = nil)
+       {
+         version: 1,
+         type: 'tfidf',
+         min_df: @min_df,
+         max_df: @max_df,
+         ngram_range: @ngram_range,
+         sublinear_tf: @sublinear_tf,
+         vocabulary: @vocabulary,
+         idf: @idf,
+         num_documents: @num_documents,
+         fitted: @fitted
+       }
+     end
+
+     # @rbs (?untyped) -> String
+     def to_json(_options = nil)
+       JSON.generate(as_json)
+     end
+
+     # Loads a vectorizer from JSON.
+     # @rbs (String | Hash[String, untyped]) -> TFIDF
+     def self.from_json(json)
+       data = json.is_a?(String) ? JSON.parse(json) : json
+       raise ArgumentError, "Invalid vectorizer type: #{data['type']}" unless data['type'] == 'tfidf'
+
+       instance = new(
+         min_df: data['min_df'],
+         max_df: data['max_df'],
+         ngram_range: data['ngram_range'],
+         sublinear_tf: data['sublinear_tf']
+       )
+
+       instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
+       instance.instance_variable_set(:@idf, symbolize_keys(data['idf']))
+       instance.instance_variable_set(:@num_documents, data['num_documents'])
+       instance.instance_variable_set(:@fitted, data['fitted'])
+       instance.instance_variable_set(:@dirty, false)
+       instance.instance_variable_set(:@storage, nil)
+
+       instance
+     end
+
+     # @rbs () -> Array[untyped]
+     def marshal_dump
+       [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+     end
+
+     # @rbs (Array[untyped]) -> void
+     def marshal_load(data)
+       @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+       @dirty = false
+       @storage = nil
+     end
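
Marshal support keeps the learned state but deliberately drops the storage backend; a small illustrative round trip:

    tfidf = Classifier::TFIDF.new.fit(['Dogs are great pets', 'Cats are independent'])

    copy = Marshal.load(Marshal.dump(tfidf))
    copy.fitted? # => true, vocabulary and IDF weights survive
    copy.dirty?  # => false
    copy.storage # => nil; a backend must be reattached before save/reload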
+
+     # Loads a vectorizer from a checkpoint.
+     #
+     # @rbs (storage: Storage::Base, checkpoint_id: String) -> TFIDF
+     def self.load_checkpoint(storage:, checkpoint_id:)
+       raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)
+
+       dir = File.dirname(storage.path)
+       base = File.basename(storage.path, '.*')
+       ext = File.extname(storage.path)
+       checkpoint_path = File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
+
+       checkpoint_storage = Storage::File.new(path: checkpoint_path)
+       instance = load(storage: checkpoint_storage)
+       instance.storage = storage
+       instance
+     end
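
Checkpoint files sit next to the primary model file, named base_checkpoint_id plus the original extension; a sketch of loading one (paths and id illustrative):

    storage = Classifier::Storage::File.new(path: 'models/tfidf.json')

    # A checkpoint saved earlier as 'before_tuning' lives at
    # models/tfidf_checkpoint_before_tuning.json
    tfidf = Classifier::TFIDF.load_checkpoint(storage: storage, checkpoint_id: 'before_tuning')
    tfidf.storage # => the primary storage, so a later #save writes models/tfidf.json again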
+
+     # Fits the vectorizer from an IO stream.
+     # Collects all documents from the stream, then fits the model.
+     # Note: All documents must be collected in memory for IDF calculation.
+     #
+     # @example Fit from a file
+     #   tfidf.fit_from_stream(File.open('corpus.txt'))
+     #
+     # @example With progress tracking
+     #   tfidf.fit_from_stream(io, batch_size: 500) do |progress|
+     #     puts "#{progress.completed} documents loaded"
+     #   end
+     #
+     # @rbs (IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> self
+     def fit_from_stream(io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
+       reader = Streaming::LineReader.new(io, batch_size: batch_size)
+       total = reader.estimate_line_count
+       progress = Streaming::Progress.new(total: total)
+
+       documents = [] #: Array[String]
+
+       reader.each_batch do |batch|
+         documents.concat(batch)
+         progress.completed += batch.size
+         progress.current_batch += 1
+         yield progress if block_given?
+       end
+
+       fit(documents) unless documents.empty?
+       self
+     end
+
+     # TFIDF doesn't support train_from_stream (use fit_from_stream instead).
+     # This method raises NotImplementedError with guidance.
+     #
+     # @rbs (*untyped, **untyped) -> void
+     def train_from_stream(*) # steep:ignore
+       raise NotImplementedError, 'TFIDF uses fit_from_stream instead of train_from_stream'
+     end
+
+     # TFIDF doesn't support train_batch (use fit instead).
+     # This method raises NotImplementedError with guidance.
+     #
+     # @rbs (*untyped, **untyped) -> void
+     def train_batch(*) # steep:ignore
+       raise NotImplementedError, 'TFIDF uses fit instead of train_batch'
+     end
+
+     private
+
+     # Restores vectorizer state from JSON string.
+     # @rbs (String) -> void
+     def restore_from_json(json)
+       data = JSON.parse(json)
+
+       @min_df = data['min_df']
+       @max_df = data['max_df']
+       @ngram_range = data['ngram_range']
+       @sublinear_tf = data['sublinear_tf']
+       @vocabulary = self.class.send(:symbolize_keys, data['vocabulary'])
+       @idf = self.class.send(:symbolize_keys, data['idf'])
+       @num_documents = data['num_documents']
+       @fitted = data['fitted']
+     end
+
+     # @rbs (String) -> Hash[Symbol, Integer]
+     def extract_terms(document)
+       result = Hash.new(0)
+
+       if @ngram_range[0] <= 1
+         word_hash = document.clean_word_hash
+         word_hash.each { |term, count| result[term] += count }
+       end
+
+       return result if @ngram_range[1] <= 1
+
+       tokens = tokenize_for_ngrams(document)
+       (2..@ngram_range[1]).each do |n|
+         next if n < @ngram_range[0]
+
+         generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
+       end
+
+       result
+     end
+
+     # @rbs (String) -> Array[String]
+     def tokenize_for_ngrams(document)
+       document
+         .gsub(/[^\w\s]/, '')
+         .split
+         .map(&:downcase)
+         .reject { |w| w.length <= 2 || String::CORPUS_SKIP_WORDS.include?(w) }
+         .map(&:stem)
+     end
+
+     # @rbs (Array[String], Integer) -> Array[Symbol]
+     def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName
+       return [] if tokens.size < n
+
+       tokens.each_cons(n).map { |gram| gram.join('_').intern }
+     end
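
For the bigram-and-above path, each_cons slides a window over the stemmed tokens and joins each window with an underscore; an illustrative run on already-tokenized words:

    tokens = %w[quick brown fox]
    tokens.each_cons(2).map { |gram| gram.join('_').intern }
    # => [:quick_brown, :brown_fox]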
+
+     # @rbs (Integer, Integer) -> bool
+     def within_df_bounds?(doc_freq, num_docs)
+       doc_freq.between?(
+         @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df,
+         @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
+       )
+     end
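
Float bounds are resolved against the corpus size before the comparison; an illustrative calculation for a 200-document corpus with min_df: 0.01 and max_df: 0.95:

    num_docs = 200
    (0.01 * num_docs).ceil  # => 2:   a term must appear in at least 2 documents
    (0.95 * num_docs).floor # => 190: and in no more than 190 of them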
+
+     # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float]
+     def normalize_vector(vector)
+       return vector if vector.empty?
+
+       magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+       return vector if magnitude.zero?
+
+       vector.transform_values { |v| v / magnitude }
+     end
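
Vectors from transform are L2-normalized, so cosine similarity between them reduces to a dot product; a hand-checkable example with made-up weights:

    vector = { dog: 3.0, loyal: 4.0 }
    magnitude = Math.sqrt(vector.values.sum { |v| v * v }) # => 5.0
    vector.transform_values { |v| v / magnitude }
    # => {:dog=>0.6, :loyal=>0.8}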
+
+     # @rbs (Integer | Float, String) -> void
+     def validate_df!(value, name)
+       raise ArgumentError, "#{name} must be an Integer or Float" unless value.is_a?(Float) || value.is_a?(Integer)
+       raise ArgumentError, "#{name} must be between 0.0 and 1.0" if value.is_a?(Float) && !value.between?(0.0, 1.0)
+       raise ArgumentError, "#{name} must be non-negative" if value.is_a?(Integer) && value.negative?
+     end
+
+     # @rbs (Array[Integer]) -> void
+     def validate_ngram_range!(range)
+       raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
+       raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?)
+       raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
+     end
+
+     # @rbs (Hash[String, untyped]) -> Hash[Symbol, untyped]
+     def self.symbolize_keys(hash)
+       hash.transform_keys(&:to_sym)
+     end
+     private_class_method :symbolize_keys
+   end
+ end
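
A minimal end-to-end sketch of the new public API, with an illustrative corpus and options:

    require 'classifier'

    tfidf = Classifier::TFIDF.new(ngram_range: [1, 2], sublinear_tf: true)

    corpus = [
      'Dogs are loyal and friendly pets',
      'Cats are independent pets',
      'Stock markets fell sharply today'
    ]

    vectors = tfidf.fit_transform(corpus) # one sparse Hash[Symbol, Float] per document
    tfidf.feature_names.first(5)          # vocabulary terms in index order
    tfidf.transform('Loyal dogs make friendly companions')
    # => L2-normalized weights over terms learned during fit; unseen terms are dropped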
data/lib/classifier.rb CHANGED
@@ -27,7 +27,11 @@
  require 'rubygems'
  require 'classifier/errors'
  require 'classifier/storage'
+ require 'classifier/streaming'
  require 'classifier/extensions/string'
  require 'classifier/extensions/vector'
  require 'classifier/bayes'
  require 'classifier/lsi'
+ require 'classifier/knn'
+ require 'classifier/tfidf'
+ require 'classifier/logistic_regression'
data/sig/vendor/matrix.rbs CHANGED
@@ -1,26 +1,37 @@
  # Type stubs for matrix gem
- class Vector[T]
+ # Using untyped elements since our usage is primarily with Floats/Numerics
+ class Vector
    EPSILON: Float

-   def self.[]: [T] (*T) -> Vector[T]
+   def self.[]: (*untyped) -> Vector
    def size: () -> Integer
-   def []: (Integer) -> T
+   def []: (Integer) -> untyped
    def magnitude: () -> Float
-   def normalize: () -> Vector[T]
-   def each: () { (T) -> void } -> void
-   def collect: [U] () { (T) -> U } -> Vector[U]
-   def to_a: () -> Array[T]
+   def normalize: () -> Vector
+   def each: () { (untyped) -> void } -> void
+   def collect: () { (untyped) -> untyped } -> Vector
+   def to_a: () -> Array[untyped]
    def *: (untyped) -> untyped
+   def -: (Vector) -> Vector
+   def is_a?: (untyped) -> bool
  end

- class Matrix[T]
-   def self.rows: [T] (Array[Array[T]]) -> Matrix[T]
-   def self.[]: [T] (*Array[T]) -> Matrix[T]
-   def self.diag: (untyped) -> Matrix[untyped]
-   def trans: () -> Matrix[T]
+ class Matrix
+   def self.rows: (Array[Array[untyped]]) -> Matrix
+   def self.[]: (*Array[untyped]) -> Matrix
+   def self.diag: (untyped) -> Matrix
+   def self.columns: (Array[Array[untyped]]) -> Matrix
+   def self.empty: (Integer, Integer) -> Matrix
+   def self.zero: (Integer, Integer) -> Matrix
+   def self.vstack: (Matrix, Matrix) -> Matrix
+   def trans: () -> Matrix
+   def transpose: () -> Matrix
    def *: (untyped) -> untyped
    def row_size: () -> Integer
    def column_size: () -> Integer
-   def column: (Integer) -> Vector[T]
-   def SV_decomp: () -> [Matrix[T], Matrix[T], untyped]
+   def row: (Integer) -> Vector
+   def column: (Integer) -> Vector
+   def SV_decomp: () -> [Matrix, Matrix, untyped]
+   def is_a?: (untyped) -> bool
+   def respond_to?: (Symbol) -> bool
  end
data/sig/vendor/streaming.rbs ADDED
@@ -0,0 +1,14 @@
+ # Type stubs for Streaming module
+ # Defines the interface that including classes must implement
+
+ module Classifier
+   # Interface for classes that include Streaming
+   interface _StreamingHost
+     def storage: () -> Storage::Base?
+     def storage=: (Storage::Base?) -> void
+     def save: () -> void
+   end
+
+   module Streaming : _StreamingHost
+   end
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: classifier
  version: !ruby/object:Gem::Version
-   version: 2.1.0
+   version: 2.2.0
  platform: ruby
  authors:
  - Lucas Carlson
@@ -121,7 +121,10 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
    version: '0'
- description: A general classifier module to allow Bayesian and other types of classifications.
+ description: A Ruby library for text classification featuring Naive Bayes, LSI (Latent
+   Semantic Indexing), Logistic Regression, and k-Nearest Neighbors classifiers. Includes
+   TF-IDF vectorization, streaming/incremental training, pluggable persistence backends,
+   thread safety, and a native C extension for fast LSI operations.
  email: lucas@rufy.com
  executables: []
  extensions:
@@ -135,6 +138,7 @@ files:
  - bin/summarize.rb
  - ext/classifier/classifier_ext.c
  - ext/classifier/extconf.rb
+ - ext/classifier/incremental_svd.c
  - ext/classifier/linalg.h
  - ext/classifier/matrix.c
  - ext/classifier/svd.c
@@ -145,19 +149,27 @@ files:
  - lib/classifier/extensions/string.rb
  - lib/classifier/extensions/vector.rb
  - lib/classifier/extensions/word_hash.rb
+ - lib/classifier/knn.rb
+ - lib/classifier/logistic_regression.rb
  - lib/classifier/lsi.rb
  - lib/classifier/lsi/content_node.rb
+ - lib/classifier/lsi/incremental_svd.rb
  - lib/classifier/lsi/summary.rb
  - lib/classifier/lsi/word_list.rb
  - lib/classifier/storage.rb
  - lib/classifier/storage/base.rb
  - lib/classifier/storage/file.rb
  - lib/classifier/storage/memory.rb
+ - lib/classifier/streaming.rb
+ - lib/classifier/streaming/line_reader.rb
+ - lib/classifier/streaming/progress.rb
+ - lib/classifier/tfidf.rb
  - sig/vendor/fast_stemmer.rbs
  - sig/vendor/gsl.rbs
  - sig/vendor/json.rbs
  - sig/vendor/matrix.rbs
  - sig/vendor/mutex_m.rbs
+ - sig/vendor/streaming.rbs
  - test/test_helper.rb
  homepage: https://rubyclassifier.com
  licenses:
@@ -174,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
-   version: '0'
+   version: '3.1'
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -183,5 +195,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []
  rubygems_version: 4.0.3
  specification_version: 4
- summary: A general classifier module to allow Bayesian and other types of classifications.
+ summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
+   vectorization.
  test_files: []