classifier 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +23 -13
- data/README.md +72 -190
- data/ext/classifier/classifier_ext.c +26 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/incremental_svd.c +393 -0
- data/ext/classifier/linalg.h +72 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +398 -54
- data/lib/classifier/errors.rb +19 -0
- data/lib/classifier/extensions/vector.rb +12 -4
- data/lib/classifier/knn.rb +351 -0
- data/lib/classifier/logistic_regression.rb +571 -0
- data/lib/classifier/lsi/content_node.rb +5 -5
- data/lib/classifier/lsi/incremental_svd.rb +166 -0
- data/lib/classifier/lsi/summary.rb +25 -5
- data/lib/classifier/lsi.rb +784 -138
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier/streaming/line_reader.rb +99 -0
- data/lib/classifier/streaming/progress.rb +96 -0
- data/lib/classifier/streaming.rb +122 -0
- data/lib/classifier/tfidf.rb +408 -0
- data/lib/classifier.rb +6 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/matrix.rbs +25 -14
- data/sig/vendor/mutex_m.rbs +16 -0
- data/sig/vendor/streaming.rbs +14 -0
- data/test/test_helper.rb +2 -0
- metadata +52 -8
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
data/lib/classifier/tfidf.rb
ADDED
@@ -0,0 +1,408 @@
+# rbs_inline: enabled
+
+# Author:: Lucas Carlson (mailto:lucas@rufy.com)
+# Copyright:: Copyright (c) 2024 Lucas Carlson
+# License:: LGPL
+
+require 'json'
+
+module Classifier
+  # TF-IDF vectorizer: transforms text to weighted feature vectors.
+  # Downweights common words, upweights discriminative terms.
+  #
+  # Example:
+  #   tfidf = Classifier::TFIDF.new
+  #   tfidf.fit(["Dogs are great pets", "Cats are independent"])
+  #   tfidf.transform("Dogs are loyal") # => {:dog=>0.7071..., :loyal=>0.7071...}
+  #
+  class TFIDF
+    include Streaming
+
+    # @rbs @min_df: Integer | Float
+    # @rbs @max_df: Integer | Float
+    # @rbs @ngram_range: Array[Integer]
+    # @rbs @sublinear_tf: bool
+    # @rbs @vocabulary: Hash[Symbol, Integer]
+    # @rbs @idf: Hash[Symbol, Float]
+    # @rbs @num_documents: Integer
+    # @rbs @fitted: bool
+    # @rbs @dirty: bool
+    # @rbs @storage: Storage::Base?
+
+    attr_reader :vocabulary, :idf, :num_documents
+    attr_accessor :storage
+
+    # Creates a new TF-IDF vectorizer.
+    # - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
+    # - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
+    # - sublinear_tf: use 1 + log(tf) instead of raw term frequency
+    #
+    # @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
+    #       ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
+    def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false)
+      validate_df!(min_df, 'min_df')
+      validate_df!(max_df, 'max_df')
+      validate_ngram_range!(ngram_range)
+
+      @min_df = min_df
+      @max_df = max_df
+      @ngram_range = ngram_range
+      @sublinear_tf = sublinear_tf
+      @vocabulary = {}
+      @idf = {}
+      @num_documents = 0
+      @fitted = false
+      @dirty = false
+      @storage = nil
+    end
+
+    # Learns vocabulary and IDF weights from the corpus.
+    # @rbs (Array[String]) -> self
+    def fit(documents)
+      raise ArgumentError, 'documents must be an array' unless documents.is_a?(Array)
+      raise ArgumentError, 'documents cannot be empty' if documents.empty?
+
+      @num_documents = documents.size
+      document_frequencies = Hash.new(0)
+
+      documents.each do |doc|
+        terms = extract_terms(doc)
+        terms.each_key { |term| document_frequencies[term] += 1 }
+      end
+
+      @vocabulary = {}
+      @idf = {}
+      vocab_index = 0
+
+      document_frequencies.each do |term, df|
+        next unless within_df_bounds?(df, @num_documents)
+
+        @vocabulary[term] = vocab_index
+        vocab_index += 1
+
+        # IDF: log((N + 1) / (df + 1)) + 1
+        @idf[term] = Math.log((@num_documents + 1).to_f / (df + 1)) + 1
+      end
+
+      @fitted = true
+      @dirty = true
+      self
+    end
+
+    # Transforms a document into a normalized TF-IDF vector.
+    # @rbs (String) -> Hash[Symbol, Float]
+    def transform(document)
+      raise NotFittedError, 'TFIDF has not been fitted. Call fit first.' unless @fitted
+
+      terms = extract_terms(document)
+      result = {} #: Hash[Symbol, Float]
+
+      terms.each do |term, tf|
+        next unless @vocabulary.key?(term)
+
+        tf_value = @sublinear_tf && tf.positive? ? 1 + Math.log(tf) : tf.to_f
+        result[term] = (tf_value * @idf[term]).to_f
+      end
+
+      normalize_vector(result)
+    end
+
+    # Fits and transforms in one step.
+    # @rbs (Array[String]) -> Array[Hash[Symbol, Float]]
+    def fit_transform(documents)
+      fit(documents)
+      documents.map { |doc| transform(doc) }
+    end
+
+    # Returns vocabulary terms in index order.
+    # @rbs () -> Array[Symbol]
+    def feature_names
+      @vocabulary.keys.sort_by { |term| @vocabulary[term] }
+    end
+
+    # @rbs () -> bool
+    def fitted?
+      @fitted
+    end
+
+    # Returns true if there are unsaved changes.
+    # @rbs () -> bool
+    def dirty?
+      @dirty
+    end
+
+    # Saves the vectorizer to the configured storage.
+    # @rbs () -> void
+    def save
+      raise ArgumentError, 'No storage configured' unless storage
+
+      storage.write(to_json)
+      @dirty = false
+    end
+
+    # Saves the vectorizer state to a file.
+    # @rbs (String) -> Integer
+    def save_to_file(path)
+      result = File.write(path, to_json)
+      @dirty = false
+      result
+    end
+
+    # Loads a vectorizer from the configured storage.
+    # @rbs (storage: Storage::Base) -> TFIDF
+    def self.load(storage:)
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+
+      instance = from_json(data)
+      instance.storage = storage
+      instance
+    end
+
+    # Loads a vectorizer from a file.
+    # @rbs (String) -> TFIDF
+    def self.load_from_file(path)
+      from_json(File.read(path))
+    end
+
+    # Reloads the vectorizer from storage, raising if there are unsaved changes.
+    # @rbs () -> self
+    def reload
+      raise ArgumentError, 'No storage configured' unless storage
+      raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
+
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+
+      restore_from_json(data)
+      @dirty = false
+      self
+    end
+
+    # Force reloads the vectorizer from storage, discarding any unsaved changes.
+    # @rbs () -> self
+    def reload!
+      raise ArgumentError, 'No storage configured' unless storage
+
+      data = storage.read
+      raise StorageError, 'No saved state found' unless data
+
+      restore_from_json(data)
+      @dirty = false
+      self
+    end
+
+    # @rbs (?untyped) -> Hash[Symbol, untyped]
+    def as_json(_options = nil)
+      {
+        version: 1,
+        type: 'tfidf',
+        min_df: @min_df,
+        max_df: @max_df,
+        ngram_range: @ngram_range,
+        sublinear_tf: @sublinear_tf,
+        vocabulary: @vocabulary,
+        idf: @idf,
+        num_documents: @num_documents,
+        fitted: @fitted
+      }
+    end
+
+    # @rbs (?untyped) -> String
+    def to_json(_options = nil)
+      JSON.generate(as_json)
+    end
+
+    # Loads a vectorizer from JSON.
+    # @rbs (String | Hash[String, untyped]) -> TFIDF
+    def self.from_json(json)
+      data = json.is_a?(String) ? JSON.parse(json) : json
+      raise ArgumentError, "Invalid vectorizer type: #{data['type']}" unless data['type'] == 'tfidf'
+
+      instance = new(
+        min_df: data['min_df'],
+        max_df: data['max_df'],
+        ngram_range: data['ngram_range'],
+        sublinear_tf: data['sublinear_tf']
+      )
+
+      instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
+      instance.instance_variable_set(:@idf, symbolize_keys(data['idf']))
+      instance.instance_variable_set(:@num_documents, data['num_documents'])
+      instance.instance_variable_set(:@fitted, data['fitted'])
+      instance.instance_variable_set(:@dirty, false)
+      instance.instance_variable_set(:@storage, nil)
+
+      instance
+    end
+
+    # @rbs () -> Array[untyped]
+    def marshal_dump
+      [@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted]
+    end
+
+    # @rbs (Array[untyped]) -> void
+    def marshal_load(data)
+      @min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted = data
+      @dirty = false
+      @storage = nil
+    end
+
+    # Loads a vectorizer from a checkpoint.
+    #
+    # @rbs (storage: Storage::Base, checkpoint_id: String) -> TFIDF
+    def self.load_checkpoint(storage:, checkpoint_id:)
+      raise ArgumentError, 'Storage must be File storage for checkpoints' unless storage.is_a?(Storage::File)
+
+      dir = File.dirname(storage.path)
+      base = File.basename(storage.path, '.*')
+      ext = File.extname(storage.path)
+      checkpoint_path = File.join(dir, "#{base}_checkpoint_#{checkpoint_id}#{ext}")
+
+      checkpoint_storage = Storage::File.new(path: checkpoint_path)
+      instance = load(storage: checkpoint_storage)
+      instance.storage = storage
+      instance
+    end
+
+    # Fits the vectorizer from an IO stream.
+    # Collects all documents from the stream, then fits the model.
+    # Note: All documents must be collected in memory for IDF calculation.
+    #
+    # @example Fit from a file
+    #   tfidf.fit_from_stream(File.open('corpus.txt'))
+    #
+    # @example With progress tracking
+    #   tfidf.fit_from_stream(io, batch_size: 500) do |progress|
+    #     puts "#{progress.completed} documents loaded"
+    #   end
+    #
+    # @rbs (IO, ?batch_size: Integer) { (Streaming::Progress) -> void } -> self
+    def fit_from_stream(io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
+      reader = Streaming::LineReader.new(io, batch_size: batch_size)
+      total = reader.estimate_line_count
+      progress = Streaming::Progress.new(total: total)
+
+      documents = [] #: Array[String]
+
+      reader.each_batch do |batch|
+        documents.concat(batch)
+        progress.completed += batch.size
+        progress.current_batch += 1
+        yield progress if block_given?
+      end
+
+      fit(documents) unless documents.empty?
+      self
+    end
+
+    # TFIDF doesn't support train_from_stream (use fit_from_stream instead).
+    # This method raises NotImplementedError with guidance.
+    #
+    # @rbs (*untyped, **untyped) -> void
+    def train_from_stream(*) # steep:ignore
+      raise NotImplementedError, 'TFIDF uses fit_from_stream instead of train_from_stream'
+    end
+
+    # TFIDF doesn't support train_batch (use fit instead).
+    # This method raises NotImplementedError with guidance.
+    #
+    # @rbs (*untyped, **untyped) -> void
+    def train_batch(*) # steep:ignore
+      raise NotImplementedError, 'TFIDF uses fit instead of train_batch'
+    end
+
+    private
+
+    # Restores vectorizer state from JSON string.
+    # @rbs (String) -> void
+    def restore_from_json(json)
+      data = JSON.parse(json)
+
+      @min_df = data['min_df']
+      @max_df = data['max_df']
+      @ngram_range = data['ngram_range']
+      @sublinear_tf = data['sublinear_tf']
+      @vocabulary = self.class.send(:symbolize_keys, data['vocabulary'])
+      @idf = self.class.send(:symbolize_keys, data['idf'])
+      @num_documents = data['num_documents']
+      @fitted = data['fitted']
+    end
+
+    # @rbs (String) -> Hash[Symbol, Integer]
+    def extract_terms(document)
+      result = Hash.new(0)
+
+      if @ngram_range[0] <= 1
+        word_hash = document.clean_word_hash
+        word_hash.each { |term, count| result[term] += count }
+      end
+
+      return result if @ngram_range[1] <= 1
+
+      tokens = tokenize_for_ngrams(document)
+      (2..@ngram_range[1]).each do |n|
+        next if n < @ngram_range[0]
+
+        generate_ngrams(tokens, n).each { |ngram| result[ngram] += 1 }
+      end
+
+      result
+    end
+
+    # @rbs (String) -> Array[String]
+    def tokenize_for_ngrams(document)
+      document
+        .gsub(/[^\w\s]/, '')
+        .split
+        .map(&:downcase)
+        .reject { |w| w.length <= 2 || String::CORPUS_SKIP_WORDS.include?(w) }
+        .map(&:stem)
+    end
+
+    # @rbs (Array[String], Integer) -> Array[Symbol]
+    def generate_ngrams(tokens, n) # rubocop:disable Naming/MethodParameterName
+      return [] if tokens.size < n
+
+      tokens.each_cons(n).map { |gram| gram.join('_').intern }
+    end
+
+    # @rbs (Integer, Integer) -> bool
+    def within_df_bounds?(doc_freq, num_docs)
+      doc_freq.between?(
+        @min_df.is_a?(Float) ? (@min_df * num_docs).ceil : @min_df,
+        @max_df.is_a?(Float) ? (@max_df * num_docs).floor : @max_df
+      )
+    end
+
+    # @rbs (Hash[Symbol, Float]) -> Hash[Symbol, Float]
+    def normalize_vector(vector)
+      return vector if vector.empty?
+
+      magnitude = Math.sqrt(vector.values.sum { |v| v * v })
+      return vector if magnitude.zero?
+
+      vector.transform_values { |v| v / magnitude }
+    end
+
+    # @rbs (Integer | Float, String) -> void
+    def validate_df!(value, name)
+      raise ArgumentError, "#{name} must be an Integer or Float" unless value.is_a?(Float) || value.is_a?(Integer)
+      raise ArgumentError, "#{name} must be between 0.0 and 1.0" if value.is_a?(Float) && !value.between?(0.0, 1.0)
+      raise ArgumentError, "#{name} must be non-negative" if value.is_a?(Integer) && value.negative?
+    end
+
+    # @rbs (Array[Integer]) -> void
+    def validate_ngram_range!(range)
+      raise ArgumentError, 'ngram_range must be an array of two integers' unless range.is_a?(Array) && range.size == 2
+      raise ArgumentError, 'ngram_range values must be positive integers' unless range.all?(Integer) && range.all?(&:positive?)
+      raise ArgumentError, 'ngram_range[0] must be <= ngram_range[1]' if range[0] > range[1]
+    end
+
+    # @rbs (Hash[String, untyped]) -> Hash[Symbol, untyped]
+    def self.symbolize_keys(hash)
+      hash.transform_keys(&:to_sym)
+    end
+    private_class_method :symbolize_keys
+  end
+end
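Taken together, the new TFIDF class supports fitting, transforming, n-grams, sublinear TF, and JSON persistence. A minimal usage sketch based only on the methods visible in the hunk above (the corpus strings are invented for illustration):

    require 'classifier'

    # Unigrams + bigrams, sublinear term frequency; vocabulary filtering
    # follows the min_df/max_df semantics of within_df_bounds? above.
    tfidf = Classifier::TFIDF.new(ngram_range: [1, 2], sublinear_tf: true)
    tfidf.fit(['Dogs are great pets', 'Cats are independent', 'Dogs are loyal'])

    # Returns an L2-normalized sparse vector keyed by stemmed term symbols.
    tfidf.transform('Dogs are loyal')  # => {:dog=>0.707..., :loyal=>0.707...}

    tfidf.feature_names                # vocabulary terms in index order
    tfidf.fitted?                      # => true

    # File round trip via save_to_file / load_from_file.
    tfidf.save_to_file('tfidf.json')
    restored = Classifier::TFIDF.load_from_file('tfidf.json')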
data/lib/classifier.rb
CHANGED
@@ -25,7 +25,13 @@
 # License:: LGPL
 
 require 'rubygems'
+require 'classifier/errors'
+require 'classifier/storage'
+require 'classifier/streaming'
 require 'classifier/extensions/string'
 require 'classifier/extensions/vector'
 require 'classifier/bayes'
 require 'classifier/lsi'
+require 'classifier/knn'
+require 'classifier/tfidf'
+require 'classifier/logistic_regression'
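A sketch of what the expanded require list exposes; Classifier::TFIDF and the Storage/Streaming modules appear in this diff, while the KNN and LogisticRegression class names are inferred from their file names:

    require 'classifier'

    Classifier::TFIDF               # tfidf.rb (shown above)
    Classifier::Storage::File       # storage/file.rb (used by TFIDF checkpoints)
    Classifier::Streaming           # streaming.rb mixin (included by TFIDF)
    Classifier::KNN                 # knn.rb (class name inferred)
    Classifier::LogisticRegression  # logistic_regression.rb (class name inferred)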
data/sig/vendor/json.rbs
ADDED
data/sig/vendor/matrix.rbs
CHANGED
@@ -1,26 +1,37 @@
 # Type stubs for matrix gem
-
+# Using untyped elements since our usage is primarily with Floats/Numerics
+class Vector
   EPSILON: Float
 
-  def self.[]:
+  def self.[]: (*untyped) -> Vector
   def size: () -> Integer
-  def []: (Integer) ->
+  def []: (Integer) -> untyped
   def magnitude: () -> Float
-  def normalize: () -> Vector
-  def each: () { (
-  def collect:
-  def to_a: () -> Array[
+  def normalize: () -> Vector
+  def each: () { (untyped) -> void } -> void
+  def collect: () { (untyped) -> untyped } -> Vector
+  def to_a: () -> Array[untyped]
   def *: (untyped) -> untyped
+  def -: (Vector) -> Vector
+  def is_a?: (untyped) -> bool
 end
 
-class Matrix
-  def self.rows:
-  def self.[]:
-  def self.diag: (untyped) -> Matrix
-  def
+class Matrix
+  def self.rows: (Array[Array[untyped]]) -> Matrix
+  def self.[]: (*Array[untyped]) -> Matrix
+  def self.diag: (untyped) -> Matrix
+  def self.columns: (Array[Array[untyped]]) -> Matrix
+  def self.empty: (Integer, Integer) -> Matrix
+  def self.zero: (Integer, Integer) -> Matrix
+  def self.vstack: (Matrix, Matrix) -> Matrix
+  def trans: () -> Matrix
+  def transpose: () -> Matrix
   def *: (untyped) -> untyped
   def row_size: () -> Integer
   def column_size: () -> Integer
-  def
-  def
+  def row: (Integer) -> Vector
+  def column: (Integer) -> Vector
+  def SV_decomp: () -> [Matrix, Matrix, untyped]
+  def is_a?: (untyped) -> bool
+  def respond_to?: (Symbol) -> bool
 end
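These stubs largely mirror Ruby's stdlib matrix gem; a small sketch exercising the stubbed methods (trans and SV_decomp come from the gem's GSL-compatible objects rather than stdlib, so they are omitted here):

    require 'matrix'

    v = Vector[3.0, 4.0]
    v.magnitude              # => 5.0
    v.normalize.to_a         # => [0.6, 0.8]
    v.collect { |x| x * 2 }  # => Vector[6.0, 8.0]
    v - Vector[1.0, 1.0]     # => Vector[2.0, 3.0]

    m = Matrix.rows([[1.0, 2.0], [3.0, 4.0]])
    m.transpose              # => Matrix[[1.0, 3.0], [2.0, 4.0]]
    m.row(1)                 # => Vector[3.0, 4.0]
    Matrix.vstack(m, Matrix.zero(1, 2)).row_size  # => 3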
data/sig/vendor/mutex_m.rbs
ADDED
@@ -0,0 +1,16 @@
+# Type stubs for mutex_m gem
+module Mutex_m
+  def mu_initialize: () -> void
+  def mu_lock: () -> void
+  def mu_unlock: () -> void
+  def mu_synchronize: [T] () { () -> T } -> T
+  def mu_try_lock: () -> bool
+  def mu_locked?: () -> bool
+
+  # Aliases
+  alias lock mu_lock
+  alias unlock mu_unlock
+  alias synchronize mu_synchronize
+  alias try_lock mu_try_lock
+  alias locked? mu_locked?
+end
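mutex_m is Ruby's standard per-object lock mixin; the aliases above (lock, synchronize, and so on) map onto the mu_-prefixed methods. A minimal sketch of the pattern:

    require 'mutex_m'

    registry = {}
    registry.extend(Mutex_m)  # the hash now carries its own mutex

    # synchronize (alias of mu_synchronize) serializes concurrent access.
    registry.synchronize do
      registry[:count] = registry.fetch(:count, 0) + 1
    end

    registry.try_lock  # => true (lock acquired)
    registry.locked?   # => true
    registry.unlock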
data/sig/vendor/streaming.rbs
ADDED
@@ -0,0 +1,14 @@
+# Type stubs for Streaming module
+# Defines the interface that including classes must implement
+
+module Classifier
+  # Interface for classes that include Streaming
+  interface _StreamingHost
+    def storage: () -> Storage::Base?
+    def storage=: (Storage::Base?) -> void
+    def save: () -> void
+  end
+
+  module Streaming : _StreamingHost
+  end
+end
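The _StreamingHost interface spells out what a class must provide before including Streaming. A hypothetical minimal host, for illustration only (MyModel and its save body are not part of the gem):

    class MyModel
      include Classifier::Streaming

      # Satisfies storage / storage= from _StreamingHost.
      attr_accessor :storage  # Storage::Base or nil

      # Satisfies save; the serialization format is up to the host class.
      def save
        raise ArgumentError, 'No storage configured' unless storage

        storage.write('{}')  # placeholder payload
      end
    end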
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: classifier
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.2.0
 platform: ruby
 authors:
 - Lucas Carlson
@@ -107,10 +107,28 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A Ruby library for text classification featuring Naive Bayes, LSI (Latent
+  Semantic Indexing), Logistic Regression, and k-Nearest Neighbors classifiers. Includes
+  TF-IDF vectorization, streaming/incremental training, pluggable persistence backends,
+  thread safety, and a native C extension for fast LSI operations.
 email: lucas@rufy.com
 executables: []
-extensions:
+extensions:
+- ext/classifier/extconf.rb
 extra_rdoc_files: []
 files:
 - CLAUDE.md
@@ -118,24 +136,49 @@ files:
 - README.md
 - bin/bayes.rb
 - bin/summarize.rb
+- ext/classifier/classifier_ext.c
+- ext/classifier/extconf.rb
+- ext/classifier/incremental_svd.c
+- ext/classifier/linalg.h
+- ext/classifier/matrix.c
+- ext/classifier/svd.c
+- ext/classifier/vector.c
 - lib/classifier.rb
 - lib/classifier/bayes.rb
+- lib/classifier/errors.rb
 - lib/classifier/extensions/string.rb
 - lib/classifier/extensions/vector.rb
-- lib/classifier/extensions/vector_serialize.rb
 - lib/classifier/extensions/word_hash.rb
+- lib/classifier/knn.rb
+- lib/classifier/logistic_regression.rb
 - lib/classifier/lsi.rb
 - lib/classifier/lsi/content_node.rb
+- lib/classifier/lsi/incremental_svd.rb
 - lib/classifier/lsi/summary.rb
 - lib/classifier/lsi/word_list.rb
+- lib/classifier/storage.rb
+- lib/classifier/storage/base.rb
+- lib/classifier/storage/file.rb
+- lib/classifier/storage/memory.rb
+- lib/classifier/streaming.rb
+- lib/classifier/streaming/line_reader.rb
+- lib/classifier/streaming/progress.rb
+- lib/classifier/tfidf.rb
 - sig/vendor/fast_stemmer.rbs
 - sig/vendor/gsl.rbs
+- sig/vendor/json.rbs
 - sig/vendor/matrix.rbs
+- sig/vendor/mutex_m.rbs
+- sig/vendor/streaming.rbs
 - test/test_helper.rb
-homepage: https://
+homepage: https://rubyclassifier.com
 licenses:
 - LGPL
-metadata:
+metadata:
+  documentation_uri: https://rubyclassifier.com/docs
+  source_code_uri: https://github.com/cardmagic/classifier
+  bug_tracker_uri: https://github.com/cardmagic/classifier/issues
+  changelog_uri: https://github.com/cardmagic/classifier/releases
 rdoc_options: []
 require_paths:
 - lib
@@ -143,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '
+      version: '3.1'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -152,5 +195,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubygems_version: 4.0.3
 specification_version: 4
-summary:
+summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
+  vectorization.
 test_files: []