classifier 2.3.2 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -5
- data/lib/classifier/bayes.rb +12 -7
- data/lib/classifier/config.rb +31 -0
- data/lib/classifier/extensions/word_hash.rb +9 -9
- data/lib/classifier/logistic_regression.rb +18 -10
- data/lib/classifier/lsi.rb +12 -3
- data/lib/classifier/tfidf.rb +15 -7
- data/lib/classifier/version.rb +1 -1
- data/lib/classifier.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c30fac948021b0009e53c7c4a232ac3e2472707fd7fb476cbb3f36e4912af399
|
|
4
|
+
data.tar.gz: 7120fa872d6ae6b49a8117c6dd672f2ee63d8b62384b348c7db8ca8cf790f88a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e9250266207fe481dfec4d09fc8d30a97e591649edcea4d0a975a5a59c379819341fa46b219ee3ecc218cbb1c156fd708bb288478b322dde81333000abecf43f
|
|
7
|
+
data.tar.gz: 7dbf662acb9b5819dd77224690668654211de7abe2c0b009fc059696880479401bd0e2e503f002b6862da955bf794990b00784856751438898c999b2f9cc49f6
|
data/README.md
CHANGED
|
@@ -27,7 +27,7 @@ gem 'classifier'
|
|
|
27
27
|
Or install via Homebrew for CLI-only usage:
|
|
28
28
|
|
|
29
29
|
```bash
|
|
30
|
-
brew install classifier
|
|
30
|
+
brew install cardmagic/tap/classifier
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## Command Line
|
|
@@ -36,15 +36,15 @@ Classify text instantly with pre-trained models—no coding required:
|
|
|
36
36
|
|
|
37
37
|
```bash
|
|
38
38
|
# Detect spam
|
|
39
|
-
classifier
|
|
39
|
+
classifier -r sms-spam-filter "You won a free iPhone"
|
|
40
40
|
# => spam
|
|
41
41
|
|
|
42
42
|
# Analyze sentiment
|
|
43
|
-
classifier
|
|
43
|
+
classifier -r imdb-sentiment "This movie was absolutely amazing"
|
|
44
44
|
# => positive
|
|
45
45
|
|
|
46
46
|
# Detect emotions
|
|
47
|
-
classifier
|
|
47
|
+
classifier -r emotion-detection "I am so happy today"
|
|
48
48
|
# => joy
|
|
49
49
|
|
|
50
50
|
# List all available models
|
|
@@ -59,12 +59,28 @@ classifier train positive reviews/good/*.txt
|
|
|
59
59
|
classifier train negative reviews/bad/*.txt
|
|
60
60
|
|
|
61
61
|
# Classify new text
|
|
62
|
-
classifier
|
|
62
|
+
classifier "Great product, highly recommend"
|
|
63
63
|
# => positive
|
|
64
64
|
```
|
|
65
65
|
|
|
66
66
|
[CLI Guide →](https://rubyclassifier.com/docs/guides/cli/basics)
|
|
67
67
|
|
|
68
|
+
### Claude Code Plugin
|
|
69
|
+
|
|
70
|
+
Install as a plugin to get skills (auto-invoked) and slash commands:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Add the marketplace
|
|
74
|
+
claude plugin marketplace add cardmagic/ai-marketplace
|
|
75
|
+
|
|
76
|
+
# Install the plugin
|
|
77
|
+
claude plugin install classifier@cardmagic
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This gives you:
|
|
81
|
+
- **Skill**: Claude automatically classifies text when you ask about spam, sentiment, or emotions
|
|
82
|
+
- **Slash commands**: `/classifier:classify`, `/classifier:train`, `/classifier:models`
|
|
83
|
+
|
|
68
84
|
## Quick Start
|
|
69
85
|
|
|
70
86
|
### Bayesian
|
data/lib/classifier/bayes.rb
CHANGED
|
@@ -20,6 +20,7 @@ module Classifier
|
|
|
20
20
|
# @rbs @cached_vocab_size: Integer?
|
|
21
21
|
# @rbs @dirty: bool
|
|
22
22
|
# @rbs @storage: Storage::Base?
|
|
23
|
+
# @rbs @min_word_length: Integer
|
|
23
24
|
|
|
24
25
|
attr_accessor :storage
|
|
25
26
|
|
|
@@ -27,8 +28,9 @@ module Classifier
|
|
|
27
28
|
# initialized and given a training method. E.g.,
|
|
28
29
|
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
|
29
30
|
# b = Classifier::Bayes.new ['Interesting', 'Uninteresting', 'Spam']
|
|
30
|
-
#
|
|
31
|
-
|
|
31
|
+
# b = Classifier::Bayes.new 'Spam', min_word_length: 1
|
|
32
|
+
# @rbs (*String | Symbol | Array[String | Symbol], ?min_word_length: Integer) -> void
|
|
33
|
+
def initialize(*categories, min_word_length: Classifier.config.min_word_length)
|
|
32
34
|
super()
|
|
33
35
|
@categories = {}
|
|
34
36
|
categories.flatten.each { |category| @categories[category.prepare_category_name] = {} }
|
|
@@ -39,6 +41,7 @@ module Classifier
|
|
|
39
41
|
@cached_vocab_size = nil
|
|
40
42
|
@dirty = false
|
|
41
43
|
@storage = nil
|
|
44
|
+
@min_word_length = min_word_length
|
|
42
45
|
end
|
|
43
46
|
|
|
44
47
|
# Trains the classifier with text for a category.
|
|
@@ -76,7 +79,7 @@ module Classifier
|
|
|
76
79
|
#
|
|
77
80
|
# @rbs (String) -> Hash[String, Float]
|
|
78
81
|
def classifications(text)
|
|
79
|
-
words = text.word_hash.keys
|
|
82
|
+
words = text.word_hash(@min_word_length).keys
|
|
80
83
|
synchronize do
|
|
81
84
|
training_count = cached_training_count
|
|
82
85
|
vocab_size = cached_vocab_size
|
|
@@ -117,7 +120,8 @@ module Classifier
|
|
|
117
120
|
categories: @categories.transform_keys(&:to_s).transform_values { |v| v.transform_keys(&:to_s) },
|
|
118
121
|
total_words: @total_words,
|
|
119
122
|
category_counts: @category_counts.transform_keys(&:to_s),
|
|
120
|
-
category_word_count: @category_word_count.transform_keys(&:to_s)
|
|
123
|
+
category_word_count: @category_word_count.transform_keys(&:to_s),
|
|
124
|
+
min_word_length: @min_word_length
|
|
121
125
|
}
|
|
122
126
|
end
|
|
123
127
|
|
|
@@ -409,7 +413,7 @@ module Classifier
|
|
|
409
413
|
invalidate_caches
|
|
410
414
|
@dirty = true
|
|
411
415
|
batch.each do |text|
|
|
412
|
-
word_hash = text.word_hash
|
|
416
|
+
word_hash = text.word_hash(@min_word_length)
|
|
413
417
|
@category_counts[category] += 1
|
|
414
418
|
word_hash.each do |word, count|
|
|
415
419
|
@categories[category][word] ||= 0
|
|
@@ -425,7 +429,7 @@ module Classifier
|
|
|
425
429
|
# @rbs (String | Symbol, String) -> void
|
|
426
430
|
def train_single(category, text)
|
|
427
431
|
category = category.prepare_category_name
|
|
428
|
-
word_hash = text.word_hash
|
|
432
|
+
word_hash = text.word_hash(@min_word_length)
|
|
429
433
|
synchronize do
|
|
430
434
|
invalidate_caches
|
|
431
435
|
@dirty = true
|
|
@@ -443,7 +447,7 @@ module Classifier
|
|
|
443
447
|
# @rbs (String | Symbol, String) -> void
|
|
444
448
|
def untrain_single(category, text)
|
|
445
449
|
category = category.prepare_category_name
|
|
446
|
-
word_hash = text.word_hash
|
|
450
|
+
word_hash = text.word_hash(@min_word_length)
|
|
447
451
|
synchronize do
|
|
448
452
|
invalidate_caches
|
|
449
453
|
@dirty = true
|
|
@@ -487,6 +491,7 @@ module Classifier
|
|
|
487
491
|
@cached_vocab_size = nil
|
|
488
492
|
@dirty = false
|
|
489
493
|
@storage = nil
|
|
494
|
+
@min_word_length = data['min_word_length'] || Classifier.config.min_word_length
|
|
490
495
|
|
|
491
496
|
data['categories'].each do |cat_name, words|
|
|
492
497
|
@categories[cat_name.to_sym] = words.transform_keys(&:to_sym)
|
|
data/lib/classifier/config.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# rbs_inline: enabled
|
|
2
|
+
|
|
3
|
+
module Classifier
|
|
4
|
+
# @rbs @config: Config?
|
|
5
|
+
|
|
6
|
+
# This lazy initialization is not thread-safe.
|
|
7
|
+
# In multi-threaded environments, ensure this method is called
|
|
8
|
+
# or configuration is set explicitly during startup before using classifiers.
|
|
9
|
+
# @rbs () -> Config
|
|
10
|
+
def config
|
|
11
|
+
@config ||= Config.new
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
# @rbs () { (Config) -> void } -> void
|
|
15
|
+
def configure(&block)
|
|
16
|
+
block&.call(config)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
module_function :config, :configure
|
|
20
|
+
|
|
21
|
+
class Config
|
|
22
|
+
# @rbs @min_word_length: Integer
|
|
23
|
+
|
|
24
|
+
attr_accessor :min_word_length #: Integer
|
|
25
|
+
|
|
26
|
+
# @rbs () -> void
|
|
27
|
+
def initialize
|
|
28
|
+
@min_word_length = 3
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
data/lib/classifier/extensions/word_hash.rb
CHANGED
|
@@ -20,27 +20,27 @@ class String
|
|
|
20
20
|
|
|
21
21
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
|
22
22
|
# interned, and indexes to its frequency in the document.
|
|
23
|
-
# @rbs () -> Hash[Symbol, Integer]
|
|
24
|
-
def word_hash
|
|
25
|
-
word_hash = clean_word_hash
|
|
23
|
+
# @rbs (?Integer) -> Hash[Symbol, Integer]
|
|
24
|
+
def word_hash(min_word_length = 3)
|
|
25
|
+
word_hash = clean_word_hash(min_word_length)
|
|
26
26
|
symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
|
|
27
27
|
word_hash.merge(symbol_hash)
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
|
31
|
-
# @rbs () -> Hash[Symbol, Integer]
|
|
32
|
-
def clean_word_hash
|
|
33
|
-
word_hash_for_words
|
|
31
|
+
# @rbs (?Integer) -> Hash[Symbol, Integer]
|
|
32
|
+
def clean_word_hash(min_word_length = 3)
|
|
33
|
+
word_hash_for_words(gsub(/[^\w\s]/, '').split, min_word_length)
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
private
|
|
37
37
|
|
|
38
|
-
# @rbs (Array[String]) -> Hash[Symbol, Integer]
|
|
39
|
-
def word_hash_for_words(words)
|
|
38
|
+
# @rbs (Array[String], Integer) -> Hash[Symbol, Integer]
|
|
39
|
+
def word_hash_for_words(words, min_word_length)
|
|
40
40
|
d = Hash.new(0)
|
|
41
41
|
words.each do |word|
|
|
42
42
|
word.downcase!
|
|
43
|
-
d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length
|
|
43
|
+
d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length >= min_word_length
|
|
44
44
|
end
|
|
45
45
|
d
|
|
46
46
|
end
|
|
data/lib/classifier/logistic_regression.rb
CHANGED
|
@@ -34,6 +34,7 @@ module Classifier
|
|
|
34
34
|
# @rbs @fitted: bool
|
|
35
35
|
# @rbs @dirty: bool
|
|
36
36
|
# @rbs @storage: Storage::Base?
|
|
37
|
+
# @rbs @min_word_length: Integer
|
|
37
38
|
|
|
38
39
|
attr_accessor :storage
|
|
39
40
|
|
|
@@ -53,13 +54,16 @@ module Classifier
|
|
|
53
54
|
# - regularization: L2 regularization strength (default: 0.01)
|
|
54
55
|
# - max_iterations: Maximum training iterations (default: 100)
|
|
55
56
|
# - tolerance: Convergence threshold (default: 1e-4)
|
|
57
|
+
# - min_word_length: Minimum word length filter in tokenization
|
|
56
58
|
#
|
|
57
59
|
# @rbs (*String | Symbol | Array[String | Symbol], ?learning_rate: Float, ?regularization: Float,
|
|
58
|
-
# ?max_iterations: Integer, ?tolerance: Float) -> void
|
|
60
|
+
# ?max_iterations: Integer, ?tolerance: Float, ?min_word_length: Integer) -> void
|
|
61
|
+
# rubocop:disable Metrics/ParameterLists
|
|
59
62
|
def initialize(*categories, learning_rate: DEFAULT_LEARNING_RATE,
|
|
60
63
|
regularization: DEFAULT_REGULARIZATION,
|
|
61
64
|
max_iterations: DEFAULT_MAX_ITERATIONS,
|
|
62
|
-
tolerance: DEFAULT_TOLERANCE
|
|
65
|
+
tolerance: DEFAULT_TOLERANCE,
|
|
66
|
+
min_word_length: Classifier.config.min_word_length)
|
|
63
67
|
super()
|
|
64
68
|
categories = categories.flatten
|
|
65
69
|
@categories = categories.map { |c| c.to_s.prepare_category_name }
|
|
@@ -74,7 +78,9 @@ module Classifier
|
|
|
74
78
|
@fitted = false
|
|
75
79
|
@dirty = false
|
|
76
80
|
@storage = nil
|
|
81
|
+
@min_word_length = min_word_length
|
|
77
82
|
end
|
|
83
|
+
# rubocop:enable Metrics/ParameterLists
|
|
78
84
|
|
|
79
85
|
# Trains the classifier with text for a category.
|
|
80
86
|
#
|
|
@@ -130,7 +136,7 @@ module Classifier
|
|
|
130
136
|
def probabilities(text)
|
|
131
137
|
raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
|
|
132
138
|
|
|
133
|
-
features = text.word_hash
|
|
139
|
+
features = text.word_hash(@min_word_length)
|
|
134
140
|
synchronize do
|
|
135
141
|
softmax(compute_scores(features))
|
|
136
142
|
end
|
|
@@ -143,7 +149,7 @@ module Classifier
|
|
|
143
149
|
def classifications(text)
|
|
144
150
|
raise NotFittedError, 'Model not fitted. Call fit() after training.' unless @fitted
|
|
145
151
|
|
|
146
|
-
features = text.word_hash
|
|
152
|
+
features = text.word_hash(@min_word_length)
|
|
147
153
|
synchronize do
|
|
148
154
|
compute_scores(features).transform_keys(&:to_s)
|
|
149
155
|
end
|
|
@@ -239,7 +245,8 @@ module Classifier
|
|
|
239
245
|
regularization: @regularization,
|
|
240
246
|
max_iterations: @max_iterations,
|
|
241
247
|
tolerance: @tolerance,
|
|
242
|
-
fitted: @fitted
|
|
248
|
+
fitted: @fitted,
|
|
249
|
+
min_word_length: @min_word_length
|
|
243
250
|
}
|
|
244
251
|
end
|
|
245
252
|
|
|
@@ -336,7 +343,7 @@ module Classifier
|
|
|
336
343
|
def marshal_dump
|
|
337
344
|
fit unless @fitted
|
|
338
345
|
[@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
|
|
339
|
-
@max_iterations, @tolerance, @fitted]
|
|
346
|
+
@max_iterations, @tolerance, @fitted, @min_word_length]
|
|
340
347
|
end
|
|
341
348
|
|
|
342
349
|
# Custom marshal deserialization to recreate mutex.
|
|
@@ -345,7 +352,7 @@ module Classifier
|
|
|
345
352
|
def marshal_load(data)
|
|
346
353
|
mu_initialize
|
|
347
354
|
@categories, @weights, @bias, @vocabulary, @learning_rate, @regularization,
|
|
348
|
-
@max_iterations, @tolerance, @fitted = data
|
|
355
|
+
@max_iterations, @tolerance, @fitted, @min_word_length = data
|
|
349
356
|
@training_data = []
|
|
350
357
|
@dirty = false
|
|
351
358
|
@storage = nil
|
|
@@ -395,7 +402,7 @@ module Classifier
|
|
|
395
402
|
reader.each_batch do |batch|
|
|
396
403
|
synchronize do
|
|
397
404
|
batch.each do |text|
|
|
398
|
-
features = text.word_hash
|
|
405
|
+
features = text.word_hash(@min_word_length)
|
|
399
406
|
features.each_key { |word| @vocabulary[word] = true }
|
|
400
407
|
@training_data << { category: category, features: features }
|
|
401
408
|
end
|
|
@@ -444,7 +451,7 @@ module Classifier
|
|
|
444
451
|
documents.each_slice(batch_size) do |batch|
|
|
445
452
|
synchronize do
|
|
446
453
|
batch.each do |text|
|
|
447
|
-
features = text.word_hash
|
|
454
|
+
features = text.word_hash(@min_word_length)
|
|
448
455
|
features.each_key { |word| @vocabulary[word] = true }
|
|
449
456
|
@training_data << { category: category, features: features }
|
|
450
457
|
end
|
|
@@ -463,7 +470,7 @@ module Classifier
|
|
|
463
470
|
category = category.to_s.prepare_category_name
|
|
464
471
|
raise StandardError, "No such category: #{category}" unless @categories.include?(category)
|
|
465
472
|
|
|
466
|
-
features = text.word_hash
|
|
473
|
+
features = text.word_hash(@min_word_length)
|
|
467
474
|
synchronize do
|
|
468
475
|
features.each_key { |word| @vocabulary[word] = true }
|
|
469
476
|
@training_data << { category: category, features: features }
|
|
@@ -570,6 +577,7 @@ module Classifier
|
|
|
570
577
|
@fitted = data.fetch('fitted', true)
|
|
571
578
|
@dirty = false
|
|
572
579
|
@storage = nil
|
|
580
|
+
@min_word_length = data['min_word_length'] || Classifier.config.min_word_length
|
|
573
581
|
end
|
|
574
582
|
|
|
575
583
|
def restore_weights_and_bias(data)
|
data/lib/classifier/lsi.rb
CHANGED
|
@@ -80,6 +80,7 @@ module Classifier
|
|
|
80
80
|
# @rbs @u_matrix: Matrix?
|
|
81
81
|
# @rbs @max_rank: Integer
|
|
82
82
|
# @rbs @initial_vocab_size: Integer?
|
|
83
|
+
# @rbs @min_word_length: Integer
|
|
83
84
|
|
|
84
85
|
attr_reader :word_list, :singular_values
|
|
85
86
|
attr_accessor :auto_rebuild, :storage
|
|
@@ -110,6 +111,7 @@ module Classifier
|
|
|
110
111
|
@max_rank = options[:max_rank] || DEFAULT_MAX_RANK
|
|
111
112
|
@u_matrix = nil
|
|
112
113
|
@initial_vocab_size = nil
|
|
114
|
+
@min_word_length = options[:min_word_length] || Classifier.config.min_word_length
|
|
113
115
|
end
|
|
114
116
|
|
|
115
117
|
# Returns true if the index needs to be rebuilt. The index needs
|
|
@@ -216,7 +218,13 @@ module Classifier
|
|
|
216
218
|
#
|
|
217
219
|
# @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
|
|
218
220
|
def add_item(item, *categories, &block)
|
|
219
|
-
clean_word_hash =
|
|
221
|
+
clean_word_hash =
|
|
222
|
+
if block
|
|
223
|
+
block.call(item).clean_word_hash(@min_word_length)
|
|
224
|
+
else
|
|
225
|
+
item.to_s.clean_word_hash(@min_word_length)
|
|
226
|
+
end
|
|
227
|
+
|
|
220
228
|
node = nil
|
|
221
229
|
|
|
222
230
|
synchronize do
|
|
@@ -480,14 +488,15 @@ module Classifier
|
|
|
480
488
|
# Custom marshal serialization to exclude mutex state
|
|
481
489
|
# @rbs () -> Array[untyped]
|
|
482
490
|
def marshal_dump
|
|
483
|
-
[@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
|
|
491
|
+
[@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @min_word_length]
|
|
484
492
|
end
|
|
485
493
|
|
|
486
494
|
# Custom marshal deserialization to recreate mutex
|
|
487
495
|
# @rbs (Array[untyped]) -> void
|
|
488
496
|
def marshal_load(data)
|
|
489
497
|
mu_initialize
|
|
490
|
-
@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty
|
|
498
|
+
@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty,
|
|
499
|
+
@min_word_length = data
|
|
491
500
|
@storage = nil
|
|
492
501
|
end
|
|
493
502
|
|
data/lib/classifier/tfidf.rb
CHANGED
|
@@ -28,6 +28,7 @@ module Classifier
|
|
|
28
28
|
# @rbs @fitted: bool
|
|
29
29
|
# @rbs @dirty: bool
|
|
30
30
|
# @rbs @storage: Storage::Base?
|
|
31
|
+
# @rbs @min_word_length: Integer
|
|
31
32
|
|
|
32
33
|
attr_reader :vocabulary, :idf, :num_documents
|
|
33
34
|
attr_accessor :storage
|
|
@@ -36,10 +37,12 @@ module Classifier
|
|
|
36
37
|
# - min_df/max_df: filter terms by document frequency (Integer for count, Float for proportion)
|
|
37
38
|
# - ngram_range: [1,1] for unigrams, [1,2] for unigrams+bigrams
|
|
38
39
|
# - sublinear_tf: use 1 + log(tf) instead of raw term frequency
|
|
40
|
+
# - min_word_length: minimum word length filter in tokenization
|
|
39
41
|
#
|
|
40
42
|
# @rbs (?min_df: Integer | Float, ?max_df: Integer | Float,
|
|
41
|
-
# ?ngram_range: Array[Integer], ?sublinear_tf: bool) -> void
|
|
42
|
-
def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false
|
|
43
|
+
# ?ngram_range: Array[Integer], ?sublinear_tf: bool, ?min_word_length: Integer) -> void
|
|
44
|
+
def initialize(min_df: 1, max_df: 1.0, ngram_range: [1, 1], sublinear_tf: false,
|
|
45
|
+
min_word_length: Classifier.config.min_word_length)
|
|
43
46
|
validate_df!(min_df, 'min_df')
|
|
44
47
|
validate_df!(max_df, 'max_df')
|
|
45
48
|
validate_ngram_range!(ngram_range)
|
|
@@ -54,6 +57,7 @@ module Classifier
|
|
|
54
57
|
@fitted = false
|
|
55
58
|
@dirty = false
|
|
56
59
|
@storage = nil
|
|
60
|
+
@min_word_length = min_word_length
|
|
57
61
|
end
|
|
58
62
|
|
|
59
63
|
# Learns vocabulary and IDF weights from the corpus.
|
|
@@ -204,7 +208,8 @@ module Classifier
|
|
|
204
208
|
vocabulary: @vocabulary,
|
|
205
209
|
idf: @idf,
|
|
206
210
|
num_documents: @num_documents,
|
|
207
|
-
fitted: @fitted
|
|
211
|
+
fitted: @fitted,
|
|
212
|
+
min_word_length: @min_word_length
|
|
208
213
|
}
|
|
209
214
|
end
|
|
210
215
|
|
|
@@ -223,7 +228,8 @@ module Classifier
|
|
|
223
228
|
min_df: data['min_df'],
|
|
224
229
|
max_df: data['max_df'],
|
|
225
230
|
ngram_range: data['ngram_range'],
|
|
226
|
-
sublinear_tf: data['sublinear_tf']
|
|
231
|
+
sublinear_tf: data['sublinear_tf'],
|
|
232
|
+
min_word_length: data['min_word_length'] || Classifier.config.min_word_length
|
|
227
233
|
)
|
|
228
234
|
|
|
229
235
|
instance.instance_variable_set(:@vocabulary, symbolize_keys(data['vocabulary']))
|
|
@@ -238,12 +244,14 @@ module Classifier
|
|
|
238
244
|
|
|
239
245
|
# @rbs () -> Array[untyped]
|
|
240
246
|
def marshal_dump
|
|
241
|
-
[@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted
|
|
247
|
+
[@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
|
|
248
|
+
@min_word_length]
|
|
242
249
|
end
|
|
243
250
|
|
|
244
251
|
# @rbs (Array[untyped]) -> void
|
|
245
252
|
def marshal_load(data)
|
|
246
|
-
@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted
|
|
253
|
+
@min_df, @max_df, @ngram_range, @sublinear_tf, @vocabulary, @idf, @num_documents, @fitted,
|
|
254
|
+
@min_word_length = data
|
|
247
255
|
@dirty = false
|
|
248
256
|
@storage = nil
|
|
249
257
|
end
|
|
@@ -334,7 +342,7 @@ module Classifier
|
|
|
334
342
|
result = Hash.new(0)
|
|
335
343
|
|
|
336
344
|
if @ngram_range[0] <= 1
|
|
337
|
-
word_hash = document.clean_word_hash
|
|
345
|
+
word_hash = document.clean_word_hash(@min_word_length)
|
|
338
346
|
word_hash.each { |term, count| result[term] += count }
|
|
339
347
|
end
|
|
340
348
|
|
data/lib/classifier/version.rb
CHANGED
data/lib/classifier.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: classifier
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.3.2
|
|
4
|
+
version: 2.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Lucas Carlson
|
|
@@ -162,6 +162,7 @@ files:
|
|
|
162
162
|
- lib/classifier.rb
|
|
163
163
|
- lib/classifier/bayes.rb
|
|
164
164
|
- lib/classifier/cli.rb
|
|
165
|
+
- lib/classifier/config.rb
|
|
165
166
|
- lib/classifier/errors.rb
|
|
166
167
|
- lib/classifier/extensions/string.rb
|
|
167
168
|
- lib/classifier/extensions/vector.rb
|
|
@@ -213,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
213
214
|
- !ruby/object:Gem::Version
|
|
214
215
|
version: '0'
|
|
215
216
|
requirements: []
|
|
216
|
-
rubygems_version: 4.0.
|
|
217
|
+
rubygems_version: 4.0.10
|
|
217
218
|
specification_version: 4
|
|
218
219
|
summary: Text classification with Bayesian, LSI, Logistic Regression, kNN, and TF-IDF
|
|
219
220
|
vectorization.
|