classifier 2.3.2 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -5
- data/lib/classifier/bayes.rb +39 -21
- data/lib/classifier/config.rb +31 -0
- data/lib/classifier/extensions/word_hash.rb +9 -9
- data/lib/classifier/knn.rb +4 -3
- data/lib/classifier/logistic_regression.rb +51 -30
- data/lib/classifier/lsi.rb +42 -17
- data/lib/classifier/streaming.rb +2 -2
- data/lib/classifier/tfidf.rb +15 -7
- data/lib/classifier/version.rb +1 -1
- data/lib/classifier.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 539130382c7ce45072e515ee9817f47fb1144af18498a81c27c7dda842477141
|
|
4
|
+
data.tar.gz: 1537f2c7c164ec70c14c7b314148fa81cfb947a93dfe328125082567c5a99a7e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3bf1385063c020bb08097192417232a6dc3fce7a16a057bec27ccf62253d69809fedc692f597b7ab6c4e36e8236629188e37add798d883ed670a55cb02c37023
|
|
7
|
+
data.tar.gz: '0528f07ae03ab30b752c3242b747058956831900c55c1c5a858ab623525075806a52fc8b66236de0f4b959ce678f9a24204887434b19c8662860d06173b2634f'
|
data/README.md
CHANGED
|
@@ -27,7 +27,7 @@ gem 'classifier'
|
|
|
27
27
|
Or install via Homebrew for CLI-only usage:
|
|
28
28
|
|
|
29
29
|
```bash
|
|
30
|
-
brew install classifier
|
|
30
|
+
brew install cardmagic/tap/classifier
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## Command Line
|
|
@@ -36,15 +36,15 @@ Classify text instantly with pre-trained models—no coding required:
|
|
|
36
36
|
|
|
37
37
|
```bash
|
|
38
38
|
# Detect spam
|
|
39
|
-
classifier
|
|
39
|
+
classifier -r sms-spam-filter "You won a free iPhone"
|
|
40
40
|
# => spam
|
|
41
41
|
|
|
42
42
|
# Analyze sentiment
|
|
43
|
-
classifier
|
|
43
|
+
classifier -r imdb-sentiment "This movie was absolutely amazing"
|
|
44
44
|
# => positive
|
|
45
45
|
|
|
46
46
|
# Detect emotions
|
|
47
|
-
classifier
|
|
47
|
+
classifier -r emotion-detection "I am so happy today"
|
|
48
48
|
# => joy
|
|
49
49
|
|
|
50
50
|
# List all available models
|
|
@@ -59,12 +59,28 @@ classifier train positive reviews/good/*.txt
|
|
|
59
59
|
classifier train negative reviews/bad/*.txt
|
|
60
60
|
|
|
61
61
|
# Classify new text
|
|
62
|
-
classifier
|
|
62
|
+
classifier "Great product, highly recommend"
|
|
63
63
|
# => positive
|
|
64
64
|
```
|
|
65
65
|
|
|
66
66
|
[CLI Guide →](https://rubyclassifier.com/docs/guides/cli/basics)
|
|
67
67
|
|
|
68
|
+
### Claude Code Plugin
|
|
69
|
+
|
|
70
|
+
Install as a plugin to get skills (auto-invoked) and slash commands:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Add the marketplace
|
|
74
|
+
claude plugin marketplace add cardmagic/ai-marketplace
|
|
75
|
+
|
|
76
|
+
# Install the plugin
|
|
77
|
+
claude plugin install classifier@cardmagic
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This gives you:
|
|
81
|
+
- **Skill**: Claude automatically classifies text when you ask about spam, sentiment, or emotions
|
|
82
|
+
- **Slash commands**: `/classifier:classify`, `/classifier:train`, `/classifier:models`
|
|
83
|
+
|
|
68
84
|
## Quick Start
|
|
69
85
|
|
|
70
86
|
### Bayesian
|
data/lib/classifier/bayes.rb
CHANGED
|
@@ -20,6 +20,7 @@ module Classifier
|
|
|
20
20
|
# @rbs @cached_vocab_size: Integer?
|
|
21
21
|
# @rbs @dirty: bool
|
|
22
22
|
# @rbs @storage: Storage::Base?
|
|
23
|
+
# @rbs @min_word_length: Integer
|
|
23
24
|
|
|
24
25
|
attr_accessor :storage
|
|
25
26
|
|
|
@@ -27,8 +28,9 @@ module Classifier
|
|
|
27
28
|
# initialized and given a training method. E.g.,
|
|
28
29
|
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
|
29
30
|
# b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
|
|
30
|
-
#
|
|
31
|
-
|
|
31
|
+
# b = Classifier::Bayes.new 'Spam', min_word_length: 1
|
|
32
|
+
# @rbs (*String | Symbol | Array[String | Symbol], ?min_word_length: Integer) -> void
|
|
33
|
+
def initialize(*categories, min_word_length: Classifier.config.min_word_length)
|
|
32
34
|
super()
|
|
33
35
|
@categories = {}
|
|
34
36
|
categories.flatten.each { |category| @categories[category.prepare_category_name] = {} }
|
|
@@ -39,6 +41,7 @@ module Classifier
|
|
|
39
41
|
@cached_vocab_size = nil
|
|
40
42
|
@dirty = false
|
|
41
43
|
@storage = nil
|
|
44
|
+
@min_word_length = min_word_length
|
|
42
45
|
end
|
|
43
46
|
|
|
44
47
|
# Trains the classifier with text for a category.
|
|
@@ -76,7 +79,7 @@ module Classifier
|
|
|
76
79
|
#
|
|
77
80
|
# @rbs (String) -> Hash[String, Float]
|
|
78
81
|
def classifications(text)
|
|
79
|
-
words = text.word_hash.keys
|
|
82
|
+
words = text.word_hash(@min_word_length).keys
|
|
80
83
|
synchronize do
|
|
81
84
|
training_count = cached_training_count
|
|
82
85
|
vocab_size = cached_vocab_size
|
|
@@ -117,7 +120,8 @@ module Classifier
|
|
|
117
120
|
categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
|
|
118
121
|
total_words: @total_words,
|
|
119
122
|
category_counts: @category_counts.transform_keys(&:to_s),
|
|
120
|
-
category_word_count: @category_word_count.transform_keys(&:to_s)
|
|
123
|
+
category_word_count: @category_word_count.transform_keys(&:to_s),
|
|
124
|
+
min_word_length: @min_word_length
|
|
121
125
|
}
|
|
122
126
|
end
|
|
123
127
|
|
|
@@ -324,20 +328,14 @@ module Classifier
|
|
|
324
328
|
# puts "#{progress.completed} documents processed"
|
|
325
329
|
# end
|
|
326
330
|
#
|
|
327
|
-
# @rbs (String | Symbol, IO
|
|
328
|
-
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
|
329
|
-
category
|
|
330
|
-
raise
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
reader.each_batch do |batch|
|
|
337
|
-
train_batch_internal(category, batch)
|
|
338
|
-
progress.completed += batch.size
|
|
339
|
-
progress.current_batch += 1
|
|
340
|
-
yield progress if block_given?
|
|
331
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
332
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
333
|
+
raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
|
|
334
|
+
raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
|
|
335
|
+
|
|
336
|
+
pairs = category && io ? { category => io } : categories
|
|
337
|
+
pairs.each do |cat, stream|
|
|
338
|
+
stream_train_category(cat, stream, batch_size: batch_size, &)
|
|
341
339
|
end
|
|
342
340
|
end
|
|
343
341
|
|
|
@@ -385,6 +383,25 @@ module Classifier
|
|
|
385
383
|
|
|
386
384
|
private
|
|
387
385
|
|
|
386
|
+
# Trains from an IO stream with a single category.
|
|
387
|
+
# @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
388
|
+
def stream_train_category(category, io, batch_size:)
|
|
389
|
+
category = category.prepare_category_name
|
|
390
|
+
raise ArgumentError, "No such category: #{category}" unless @categories.key?(category)
|
|
391
|
+
raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
|
|
392
|
+
|
|
393
|
+
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
394
|
+
total = reader.estimate_line_count
|
|
395
|
+
progress = Streaming::Progress.new(total: total)
|
|
396
|
+
|
|
397
|
+
reader.each_batch do |batch|
|
|
398
|
+
train_batch_internal(category, batch)
|
|
399
|
+
progress.completed += batch.size
|
|
400
|
+
progress.current_batch += 1
|
|
401
|
+
yield progress if block_given?
|
|
402
|
+
end
|
|
403
|
+
end
|
|
404
|
+
|
|
388
405
|
# Trains a batch of documents for a single category.
|
|
389
406
|
# @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
390
407
|
def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
|
@@ -409,7 +426,7 @@ module Classifier
|
|
|
409
426
|
invalidate_caches
|
|
410
427
|
@dirty = true
|
|
411
428
|
batch.each do |text|
|
|
412
|
-
word_hash = text.word_hash
|
|
429
|
+
word_hash = text.word_hash(@min_word_length)
|
|
413
430
|
@category_counts[category] += 1
|
|
414
431
|
word_hash.each do |word, count|
|
|
415
432
|
@categories[category][word] ||= 0
|
|
@@ -425,7 +442,7 @@ module Classifier
|
|
|
425
442
|
# @rbs (String | Symbol, String) -> void
|
|
426
443
|
def train_single(category, text)
|
|
427
444
|
category = category.prepare_category_name
|
|
428
|
-
word_hash = text.word_hash
|
|
445
|
+
word_hash = text.word_hash(@min_word_length)
|
|
429
446
|
synchronize do
|
|
430
447
|
invalidate_caches
|
|
431
448
|
@dirty = true
|
|
@@ -443,7 +460,7 @@ module Classifier
|
|
|
443
460
|
# @rbs (String | Symbol, String) -> void
|
|
444
461
|
def untrain_single(category, text)
|
|
445
462
|
category = category.prepare_category_name
|
|
446
|
-
word_hash = text.word_hash
|
|
463
|
+
word_hash = text.word_hash(@min_word_length)
|
|
447
464
|
synchronize do
|
|
448
465
|
invalidate_caches
|
|
449
466
|
@dirty = true
|
|
@@ -487,6 +504,7 @@ module Classifier
|
|
|
487
504
|
@cached_vocab_size = nil
|
|
488
505
|
@dirty = false
|
|
489
506
|
@storage = nil
|
|
507
|
+
@min_word_length = data['min_word_length'] || Classifier.config.min_word_length
|
|
490
508
|
|
|
491
509
|
data['categories'].each do |cat_name, words|
|
|
492
510
|
@categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
module Classifier
|
|
4
|
+
# @rbs @config: Config?
|
|
5
|
+
|
|
6
|
+
# This lazy initialization is not thread-safe.
|
|
7
|
+
# In multi-threaded environments, ensure this method is called
|
|
8
|
+
# or configuration is set explicitly during startup before using classifiers.
|
|
9
|
+
# @rbs () -> Config
|
|
10
|
+
def config
|
|
11
|
+
@config ||= Config.new
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# @rbs () { (Config) -> void } -> void
|
|
15
|
+
def configure(&block)
|
|
16
|
+
block&.call(config)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
module_function :config, :configure
|
|
20
|
+
|
|
21
|
+
class Config
|
|
22
|
+
# @rbs @min_word_length: Integer
|
|
23
|
+
|
|
24
|
+
attr_accessor :min_word_length #: Integer
|
|
25
|
+
|
|
26
|
+
# @rbs () -> void
|
|
27
|
+
def initialize
|
|
28
|
+
@min_word_length = 3
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -20,27 +20,27 @@ class String
|
|
|
20
20
|
|
|
21
21
|
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
|
22
22
|
# interned, and indexes to its frequency in the document.
|
|
23
|
-
# @rbs () -> Hash[Symbol, Integer]
|
|
24
|
-
def word_hash
|
|
25
|
-
word_hash = clean_word_hash
|
|
23
|
+
# @rbs (?Integer) -> Hash[Symbol, Integer]
|
|
24
|
+
def word_hash(min_word_length = 3)
|
|
25
|
+
word_hash = clean_word_hash(min_word_length)
|
|
26
26
|
symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
|
|
27
27
|
word_hash.merge(symbol_hash)
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
|
31
|
-
# @rbs () -> Hash[Symbol, Integer]
|
|
32
|
-
def clean_word_hash
|
|
33
|
-
word_hash_for_words
|
|
31
|
+
# @rbs (?Integer) -> Hash[Symbol, Integer]
|
|
32
|
+
def clean_word_hash(min_word_length = 3)
|
|
33
|
+
word_hash_for_words(gsub(/[^\w\s]/, '').split, min_word_length)
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
private
|
|
37
37
|
|
|
38
|
-
# @rbs (Array[String]) -> Hash[Symbol, Integer]
|
|
39
|
-
def word_hash_for_words(words)
|
|
38
|
+
# @rbs (Array[String], Integer) -> Hash[Symbol, Integer]
|
|
39
|
+
def word_hash_for_words(words, min_word_length)
|
|
40
40
|
d = Hash.new(0)
|
|
41
41
|
words.each do |word|
|
|
42
42
|
word.downcase!
|
|
43
|
-
d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length
|
|
43
|
+
d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length >= min_word_length
|
|
44
44
|
end
|
|
45
45
|
d
|
|
46
46
|
end
|
data/lib/classifier/knn.rb
CHANGED
|
@@ -268,9 +268,10 @@ module Classifier
|
|
|
268
268
|
# puts "#{progress.completed} documents processed"
|
|
269
269
|
# end
|
|
270
270
|
#
|
|
271
|
-
# @rbs (String | Symbol, IO
|
|
272
|
-
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE, &
|
|
273
|
-
@
|
|
271
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
272
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
273
|
+
# @type var categories: untyped
|
|
274
|
+
@lsi.train_from_stream(category, io, batch_size: batch_size, **categories, &)
|
|
274
275
|
synchronize { @dirty = true }
|
|
275
276
|
end
|
|
276
277
|
|
|
@@ -34,6 +34,7 @@ module Classifier
|
|
|
34
34
|
# @rbs @fitted: bool
|
|
35
35
|
# @rbs @dirty: bool
|
|
36
36
|
# @rbs @storage: Storage::Base?
|
|
37
|
+
# @rbs @min_word_length: Integer
|
|
37
38
|
|
|
38
39
|
attr_accessor :storage
|
|
39
40
|
|
|
@@ -53,13 +54,16 @@ module Classifier
|
|
|
53
54
|
# - regularization: L2 regularization strength (default: 0.01)
|
|
54
55
|
# - max_iterations: Maximum training iterations (default: 100)
|
|
55
56
|
# - tolerance: Convergence threshold (default: 1e-4)
|
|
57
|
+
# - min_word_length: Minimum length a word must have to be kept during tokenization (default: Classifier.config.min_word_length, initially 3)
|
|
56
58
|
#
|
|
57
59
|
# @rbs (*String | Symbol | Array[String | Symbol], ?learning_rate: Float, ?regularization: Float,
|
|
58
|
-
# ?max_iterations: Integer, ?tolerance: Float) -> void
|
|
60
|
+
# ?max_iterations: Integer, ?tolerance: Float, ?min_word_length: Integer) -> void
|
|
61
|
+
# rubocop:disable Metrics/ParameterLists
|
|
59
62
|
def initialize(*categories, learning_rate: DEFAULT_LEARNING_RATE,
|
|
60
63
|
regularization: DEFAULT_REGULARIZATION,
|
|
61
64
|
max_iterations: DEFAULT_MAX_ITERATIONS,
|
|
62
|
-
tolerance: DEFAULT_TOLERANCE
|
|
65
|
+
tolerance: DEFAULT_TOLERANCE,
|
|
66
|
+
min_word_length: Classifier.config.min_word_length)
|
|
63
67
|
super()
|
|
64
68
|
categories = categories.flatten
|
|
65
69
|
@categories = categories.map { |c| c.to_s.prepare_category_name }
|
|
@@ -74,7 +78,9 @@ module Classifier
|
|
|
74
78
|
@fitted = false
|
|
75
79
|
@dirty = false
|
|
76
80
|
@storage = nil
|
|
81
|
+
@min_word_length = min_word_length
|
|
77
82
|
end
|
|
83
|
+
# rubocop:enable Metrics/ParameterLists
|
|
78
84
|
|
|
79
85
|
# Trains the classifier with text for a category.
|
|
80
86
|
#
|
|
@@ -130,7 +136,7 @@ module Classifier
|
|
|
130
136
|
def probabilities(text)
|
|
131
137
|
raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
|
|
132
138
|
|
|
133
|
-
features = text.word_hash
|
|
139
|
+
features = text.word_hash(@min_word_length)
|
|
134
140
|
synchronize do
|
|
135
141
|
softmax(compute_scores(features))
|
|
136
142
|
end
|
|
@@ -143,7 +149,7 @@ module Classifier
|
|
|
143
149
|
def classifications(text)
|
|
144
150
|
raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
|
|
145
151
|
|
|
146
|
-
features = text.word_hash
|
|
152
|
+
features = text.word_hash(@min_word_length)
|
|
147
153
|
synchronize do
|
|
148
154
|
compute_scores(features).transform_keys(&:to_s)
|
|
149
155
|
end
|
|
@@ -239,7 +245,8 @@ module Classifier
|
|
|
239
245
|
regularization: @regularization,
|
|
240
246
|
max_iterations: @max_iterations,
|
|
241
247
|
tolerance: @tolerance,
|
|
242
|
-
fitted: @fitted
|
|
248
|
+
fitted: @fitted,
|
|
249
|
+
min_word_length: @min_word_length
|
|
243
250
|
}
|
|
244
251
|
end
|
|
245
252
|
|
|
@@ -336,7 +343,7 @@ module Classifier
|
|
|
336
343
|
def marshal_dump
|
|
337
344
|
fit unless @fitted
|
|
338
345
|
[@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
|
|
339
|
-
@max_iterations, @tolerance, @fitted]
|
|
346
|
+
@max_iterations, @tolerance, @fitted, @min_word_length]
|
|
340
347
|
end
|
|
341
348
|
|
|
342
349
|
# Custom marshal deserialization to recreate mutex.
|
|
@@ -345,7 +352,7 @@ module Classifier
|
|
|
345
352
|
def marshal_load(data)
|
|
346
353
|
mu_initialize
|
|
347
354
|
@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
|
|
348
|
-
@max_iterations, @tolerance, @fitted = data
|
|
355
|
+
@max_iterations, @tolerance, @fitted, @min_word_length = data
|
|
349
356
|
@training_data = []
|
|
350
357
|
@dirty = false
|
|
351
358
|
@storage = nil
|
|
@@ -383,28 +390,14 @@ module Classifier
|
|
|
383
390
|
# end
|
|
384
391
|
# classifier.fit
|
|
385
392
|
#
|
|
386
|
-
# @rbs (String | Symbol, IO
|
|
387
|
-
def train_from_stream(category, io, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
|
388
|
-
category
|
|
389
|
-
raise
|
|
390
|
-
|
|
391
|
-
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
392
|
-
total = reader.estimate_line_count
|
|
393
|
-
progress = Streaming::Progress.new(total: total)
|
|
393
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
394
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
395
|
+
raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
|
|
396
|
+
raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
|
|
394
397
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
features = text.word_hash
|
|
399
|
-
features.each_key { |word| @vocabulary[word] = true }
|
|
400
|
-
@training_data << { category: category, features: features }
|
|
401
|
-
end
|
|
402
|
-
@fitted = false
|
|
403
|
-
@dirty = true
|
|
404
|
-
end
|
|
405
|
-
progress.completed += batch.size
|
|
406
|
-
progress.current_batch += 1
|
|
407
|
-
yield progress if block_given?
|
|
398
|
+
pairs = category && io ? { category => io } : categories
|
|
399
|
+
pairs.each do |cat, stream|
|
|
400
|
+
stream_train_category(cat, stream, batch_size:, &)
|
|
408
401
|
end
|
|
409
402
|
end
|
|
410
403
|
|
|
@@ -433,6 +426,33 @@ module Classifier
|
|
|
433
426
|
|
|
434
427
|
private
|
|
435
428
|
|
|
429
|
+
# Trains from an IO stream with a single category.
|
|
430
|
+
# @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
431
|
+
def stream_train_category(category, io, batch_size:)
|
|
432
|
+
category = category.to_s.prepare_category_name
|
|
433
|
+
raise ArgumentError, "No such category: #{category}" unless @categories.include?(category)
|
|
434
|
+
raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
|
|
435
|
+
|
|
436
|
+
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
437
|
+
total = reader.estimate_line_count
|
|
438
|
+
progress = Streaming::Progress.new(total: total)
|
|
439
|
+
|
|
440
|
+
reader.each_batch do |batch|
|
|
441
|
+
synchronize do
|
|
442
|
+
batch.each do |text|
|
|
443
|
+
features = text.word_hash(@min_word_length)
|
|
444
|
+
features.each_key { |word| @vocabulary[word] = true }
|
|
445
|
+
@training_data << { category: category, features: features }
|
|
446
|
+
end
|
|
447
|
+
@fitted = false
|
|
448
|
+
@dirty = true
|
|
449
|
+
end
|
|
450
|
+
progress.completed += batch.size
|
|
451
|
+
progress.current_batch += 1
|
|
452
|
+
yield progress if block_given?
|
|
453
|
+
end
|
|
454
|
+
end
|
|
455
|
+
|
|
436
456
|
# Trains a batch of documents for a single category.
|
|
437
457
|
# @rbs (String | Symbol, Array[String], ?batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
438
458
|
def train_batch_for_category(category, documents, batch_size: Streaming::DEFAULT_BATCH_SIZE)
|
|
@@ -444,7 +464,7 @@ module Classifier
|
|
|
444
464
|
documents.each_slice(batch_size) do |batch|
|
|
445
465
|
synchronize do
|
|
446
466
|
batch.each do |text|
|
|
447
|
-
features = text.word_hash
|
|
467
|
+
features = text.word_hash(@min_word_length)
|
|
448
468
|
features.each_key { |word| @vocabulary[word] = true }
|
|
449
469
|
@training_data << { category: category, features: features }
|
|
450
470
|
end
|
|
@@ -463,7 +483,7 @@ module Classifier
|
|
|
463
483
|
category = category.to_s.prepare_category_name
|
|
464
484
|
raise StandardError, "No such category: #{category}" unless @categories.include?(category)
|
|
465
485
|
|
|
466
|
-
features = text.word_hash
|
|
486
|
+
features = text.word_hash(@min_word_length)
|
|
467
487
|
synchronize do
|
|
468
488
|
features.each_key { |word| @vocabulary[word] = true }
|
|
469
489
|
@training_data << { category: category, features: features }
|
|
@@ -570,6 +590,7 @@ module Classifier
|
|
|
570
590
|
@fitted = data.fetch('fitted', true)
|
|
571
591
|
@dirty = false
|
|
572
592
|
@storage = nil
|
|
593
|
+
@min_word_length = data['min_word_length'] || Classifier.config.min_word_length
|
|
573
594
|
end
|
|
574
595
|
|
|
575
596
|
def restore_weights_and_bias(data)
|
data/lib/classifier/lsi.rb
CHANGED
|
@@ -80,6 +80,7 @@ module Classifier
|
|
|
80
80
|
# @rbs @u_matrix: Matrix?
|
|
81
81
|
# @rbs @max_rank: Integer
|
|
82
82
|
# @rbs @initial_vocab_size: Integer?
|
|
83
|
+
# @rbs @min_word_length: Integer
|
|
83
84
|
|
|
84
85
|
attr_reader :word_list, :singular_values
|
|
85
86
|
attr_accessor :auto_rebuild, :storage
|
|
@@ -110,6 +111,7 @@ module Classifier
|
|
|
110
111
|
@max_rank = options[:max_rank] || DEFAULT_MAX_RANK
|
|
111
112
|
@u_matrix = nil
|
|
112
113
|
@initial_vocab_size = nil
|
|
114
|
+
@min_word_length = options[:min_word_length] || Classifier.config.min_word_length
|
|
113
115
|
end
|
|
114
116
|
|
|
115
117
|
# Returns true if the index needs to be rebuilt. The index needs
|
|
@@ -216,7 +218,13 @@ module Classifier
|
|
|
216
218
|
#
|
|
217
219
|
# @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
|
|
218
220
|
def add_item(item, *categories, &block)
|
|
219
|
-
clean_word_hash =
|
|
221
|
+
clean_word_hash =
|
|
222
|
+
if block
|
|
223
|
+
block.call(item).clean_word_hash(@min_word_length)
|
|
224
|
+
else
|
|
225
|
+
item.to_s.clean_word_hash(@min_word_length)
|
|
226
|
+
end
|
|
227
|
+
|
|
220
228
|
node = nil
|
|
221
229
|
|
|
222
230
|
synchronize do
|
|
@@ -480,14 +488,15 @@ module Classifier
|
|
|
480
488
|
# Custom marshal serialization to exclude mutex state
|
|
481
489
|
# @rbs () -> Array[untyped]
|
|
482
490
|
def marshal_dump
|
|
483
|
-
[@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
|
|
491
|
+
[@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @min_word_length]
|
|
484
492
|
end
|
|
485
493
|
|
|
486
494
|
# Custom marshal deserialization to recreate mutex
|
|
487
495
|
# @rbs (Array[untyped]) -> void
|
|
488
496
|
def marshal_load(data)
|
|
489
497
|
mu_initialize
|
|
490
|
-
@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty
|
|
498
|
+
@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty,
|
|
499
|
+
@min_word_length = data
|
|
491
500
|
@storage = nil
|
|
492
501
|
end
|
|
493
502
|
|
|
@@ -653,21 +662,22 @@ module Classifier
|
|
|
653
662
|
# puts "#{progress.completed} documents processed"
|
|
654
663
|
# end
|
|
655
664
|
#
|
|
656
|
-
#
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
665
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
666
|
+
# @rbs (?(String | Symbol | nil), ?IO?, ?batch_size: Integer, **IO) { (Streaming::Progress) -> void } -> void
|
|
667
|
+
def train_from_stream(category = nil, io = nil, batch_size: Streaming::DEFAULT_BATCH_SIZE, **categories, &)
|
|
668
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
669
|
+
raise ArgumentError, 'Provide either (category, io) or keyword category: io pairs' if category.nil? && io.nil? && categories.empty?
|
|
670
|
+
raise ArgumentError, 'Provide both category and io, or use keyword arguments' if [category, io].one?(&:nil?)
|
|
671
|
+
|
|
672
|
+
pairs = category && io ? { category => io } : categories
|
|
673
|
+
pairs.each_value do |io|
|
|
674
|
+
raise ArgumentError, 'Stream must respond to #each_line' unless io.respond_to?(:each_line)
|
|
675
|
+
end
|
|
661
676
|
begin
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
reader.each_batch do |batch|
|
|
667
|
-
batch.each { |text| add_item(text, category) }
|
|
668
|
-
progress.completed += batch.size
|
|
669
|
-
progress.current_batch += 1
|
|
670
|
-
yield progress if block_given?
|
|
677
|
+
original_auto_rebuild = @auto_rebuild
|
|
678
|
+
@auto_rebuild = false
|
|
679
|
+
pairs.each do |cat, stream|
|
|
680
|
+
stream_train_category(cat, stream, batch_size:, &)
|
|
671
681
|
end
|
|
672
682
|
ensure
|
|
673
683
|
@auto_rebuild = original_auto_rebuild
|
|
@@ -720,6 +730,21 @@ module Classifier
|
|
|
720
730
|
|
|
721
731
|
private
|
|
722
732
|
|
|
733
|
+
# Trains from an IO stream with a single category.
|
|
734
|
+
# @rbs (String | Symbol, IO, batch_size: Integer) { (Streaming::Progress) -> void } -> void
|
|
735
|
+
def stream_train_category(category, io, batch_size:)
|
|
736
|
+
reader = Streaming::LineReader.new(io, batch_size: batch_size)
|
|
737
|
+
total = reader.estimate_line_count
|
|
738
|
+
progress = Streaming::Progress.new(total: total)
|
|
739
|
+
|
|
740
|
+
reader.each_batch do |batch|
|
|
741
|
+
batch.each { |text| add_item(text, category) }
|
|
742
|
+
progress.completed += batch.size
|
|
743
|
+
progress.current_batch += 1
|
|
744
|
+
yield progress if block_given?
|
|
745
|
+
end
|
|
746
|
+
end
|
|
747
|
+
|
|
723
748
|
# Restores LSI state from a JSON string (used by reload)
|
|
724
749
|
# @rbs (String) -> void
|
|
725
750
|
def restore_from_json(json)
|
data/lib/classifier/streaming.rb
CHANGED
|
@@ -26,8 +26,8 @@ module Classifier
|
|
|
26
26
|
# Trains the classifier from an IO stream.
|
|
27
27
|
# Each line in the stream is treated as a separate document.
|
|
28
28
|
#
|
|
29
|
-
# @rbs (Symbol | String, IO
|
|
30
|
-
def train_from_stream(category, io, batch_size: DEFAULT_BATCH_SIZE, &block)
|
|
29
|
+
# @rbs (?(Symbol | String | nil), ?IO?, ?batch_size: Integer, **IO) { (Progress) -> void } -> void
|
|
30
|
+
def train_from_stream(category = nil, io = nil, batch_size: DEFAULT_BATCH_SIZE, **categories, &block)
|
|
31
31
|
raise NotImplementedError, "#{self.class} must implement train_from_stream"
|
|
32
32
|
end
|
|
33
33
|
|
data/lib/classifier/tfidf.rb
CHANGED
|
@@ -28,6 +28,7 @@ module Classifier
|
|
|
28
28
|
# @rbs @fitted: bool
|
|
29
29
|
# @rbs @dirty: bool
|
|
30
30
|
# @rbs @storage: Storage::Base?
|
|
31
|
+
# @rbs @min_word_length: Integer
|
|
31
32
|
|
|
32
33
|
attr_reader :vocabulary, :idf, :num_documents
|
|
33
34
|
attr_accessor :storage
|
|
@@ -36,10 +37,12 @@ module Classifier
|
|
|
36
37
|
# - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
|
|
37
38
|
# - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
|
|
38
39
|
# - sublinear_tf: use 1 + log(tf) instead of raw term frequency
|
|
40
|
+
# - min_word_length: minimum length a word must have to be kept during tokenization (default: Classifier.config.min_word_length, initially 3)
|
|
39
41
|
#
|
|
40
42
|
# @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
|
|
41
|
-
# ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
|
|
42
|
-
def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false
|
|
43
|
+
# ?ngram_range: Array[Integer], ?sublinear_tf: bool, ?min_word_length: Integer) -> void
|
|
44
|
+
def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false,
|
|
45
|
+
min_word_length: Classifier.config.min_word_length)
|
|
43
46
|
validate_df!(min_df, 'min_df')
|
|
44
47
|
validate_df!(max_df, 'max_df')
|
|
45
48
|
validate_ngram_range!(ngram_range)
|
|
@@ -54,6 +57,7 @@ module Classifier
|
|
|
54
57
|
@fitted = false
|
|
55
58
|
@dirty = false
|
|
56
59
|
@storage = nil
|
|
60
|
+
@min_word_length = min_word_length
|
|
57
61
|
end
|
|
58
62
|
|
|
59
63
|
# Learns vocabulary and IDF weights from the corpus.
|
|
@@ -204,7 +208,8 @@ module Classifier
|
|
|
204
208
|
vocabulary: @vocabulary,
|
|
205
209
|
idf: @idf,
|
|
206
210
|
num_documents: @num_documents,
|
|
207
|
-
fitted: @fitted
|
|
211
|
+
fitted: @fitted,
|
|
212
|
+
min_word_length: @min_word_length
|
|
208
213
|
}
|
|
209
214
|
end
|
|
210
215
|
|
|
@@ -223,7 +228,8 @@ module Classifier
|
|
|
223
228
|
min_df: data['min_df'],
|
|
224
229
|
max_df: data['max_df'],
|
|
225
230
|
ngram_range: data['ngram_range'],
|
|
226
|
-
sublinear_tf: data['sublinear_tf']
|
|
231
|
+
sublinear_tf: data['sublinear_tf'],
|
|
232
|
+
min_word_length: data['min_word_length'] || Classifier.config.min_word_length
|
|
227
233
|
)
|
|
228
234
|
|
|
229
235
|
instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
|
|
@@ -238,12 +244,14 @@ module Classifier
|
|
|
238
244
|
|
|
239
245
|
# @rbs () -> Array[untyped]
|
|
240
246
|
def marshal_dump
|
|
241
|
-
[@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted
|
|
247
|
+
[@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
|
|
248
|
+
@min_word_length]
|
|
242
249
|
end
|
|
243
250
|
|
|
244
251
|
# @rbs (Array[untyped]) -> void
|
|
245
252
|
def marshal_load(data)
|
|
246
|
-
@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted
|
|
253
|
+
@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
|
|
254
|
+
@min_word_length = data
|
|
247
255
|
@dirty = false
|
|
248
256
|
@storage = nil
|
|
249
257
|
end
|
|
@@ -334,7 +342,7 @@ module Classifier
|
|
|
334
342
|
result = Hash.new(0)
|
|
335
343
|
|
|
336
344
|
if @ngram_range[0] <= 1
|
|
337
|
-
word_hash = document.clean_word_hash
|
|
345
|
+
word_hash = document.clean_word_hash(@min_word_length)
|
|
338
346
|
word_hash.each { |term, count| result[term] += count }
|
|
339
347
|
end
|
|
340
348
|
|
data/lib/classifier/version.rb
CHANGED
data/lib/classifier.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: classifier
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.3.2
|
|
4
|
+
version: 2.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Lucas Carlson
|
|
@@ -162,6 +162,7 @@ files:
|
|
|
162
162
|
- lib/classifier.rb
|
|
163
163
|
- lib/classifier/bayes.rb
|
|
164
164
|
- lib/classifier/cli.rb
|
|
165
|
+
- lib/classifier/config.rb
|
|
165
166
|
- lib/classifier/errors.rb
|
|
166
167
|
- lib/classifier/extensions/string.rb
|
|
167
168
|
- lib/classifier/extensions/vector.rb
|
|
@@ -213,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
213
214
|
- !ruby/object:Gem::Version
|
|
214
215
|
version: '0'
|
|
215
216
|
requirements: []
|
|
216
|
-
rubygems_version: 4.0.
|
|
217
|
+
rubygems_version: 4.0.10
|
|
217
218
|
specification_version: 4
|
|
218
219
|
summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
|
|
219
220
|
vectorization.
|