classifier-reborn 2.0.4 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +74 -1
- data/README.markdown +57 -207
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
- data/lib/classifier-reborn/bayes.rb +141 -65
- data/lib/classifier-reborn/category_namer.rb +6 -4
- data/lib/classifier-reborn/extensions/hasher.rb +22 -39
- data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
- data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
- data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
- data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
- data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
- data/lib/classifier-reborn/extensions/vector.rb +35 -28
- data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
- data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
- data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
- data/lib/classifier-reborn/lsi/content_node.rb +35 -25
- data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
- data/lib/classifier-reborn/lsi/word_list.rb +5 -6
- data/lib/classifier-reborn/lsi.rb +166 -94
- data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
- data/lib/classifier-reborn/version.rb +3 -1
- data/lib/classifier-reborn.rb +12 -1
- metadata +98 -17
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ClassifierReborn
|
4
|
+
class BayesMemoryBackend
|
5
|
+
attr_reader :total_words, :total_trainings
|
6
|
+
|
7
|
+
# This class provides Memory as the storage backend for the classifier data structures
|
8
|
+
def initialize
|
9
|
+
@total_words = 0
|
10
|
+
@total_trainings = 0
|
11
|
+
@category_counts = {}
|
12
|
+
@categories = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
def update_total_words(diff)
|
16
|
+
@total_words += diff
|
17
|
+
end
|
18
|
+
|
19
|
+
def update_total_trainings(diff)
|
20
|
+
@total_trainings += diff
|
21
|
+
end
|
22
|
+
|
23
|
+
def category_training_count(category)
|
24
|
+
category_counts(category)[:training]
|
25
|
+
end
|
26
|
+
|
27
|
+
def update_category_training_count(category, diff)
|
28
|
+
category_counts(category)[:training] += diff
|
29
|
+
end
|
30
|
+
|
31
|
+
def category_has_trainings?(category)
|
32
|
+
@category_counts.key?(category) && category_training_count(category) > 0
|
33
|
+
end
|
34
|
+
|
35
|
+
def category_word_count(category)
|
36
|
+
category_counts(category)[:word]
|
37
|
+
end
|
38
|
+
|
39
|
+
def update_category_word_count(category, diff)
|
40
|
+
category_counts(category)[:word] += diff
|
41
|
+
end
|
42
|
+
|
43
|
+
def add_category(category)
|
44
|
+
@categories[category] ||= Hash.new(0)
|
45
|
+
end
|
46
|
+
|
47
|
+
def category_keys
|
48
|
+
@categories.keys
|
49
|
+
end
|
50
|
+
|
51
|
+
def category_word_frequency(category, word)
|
52
|
+
@categories[category][word]
|
53
|
+
end
|
54
|
+
|
55
|
+
def update_category_word_frequency(category, word, diff)
|
56
|
+
@categories[category][word] += diff
|
57
|
+
end
|
58
|
+
|
59
|
+
def delete_category_word(category, word)
|
60
|
+
@categories[category].delete(word)
|
61
|
+
end
|
62
|
+
|
63
|
+
def word_in_category?(category, word)
|
64
|
+
@categories[category].key?(word)
|
65
|
+
end
|
66
|
+
|
67
|
+
def reset
|
68
|
+
initialize
|
69
|
+
end
|
70
|
+
|
71
|
+
private
|
72
|
+
|
73
|
+
def category_counts(category)
|
74
|
+
@category_counts[category] ||= { training: 0, word: 0 }
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'no_redis_error'
|
4
|
+
# require redis when we run #initialize. This way only people using this backend
|
5
|
+
# will need to install and load the backend without having to
|
6
|
+
# require 'classifier-reborn/backends/bayes_redis_backend'
|
7
|
+
|
8
|
+
module ClassifierReborn
|
9
|
+
# This class provides Redis as the storage backend for the classifier data structures
|
10
|
+
class BayesRedisBackend
|
11
|
+
# The class can be created with the same arguments that the redis gem accepts
|
12
|
+
# E.g.,
|
13
|
+
# b = ClassifierReborn::BayesRedisBackend.new
|
14
|
+
# b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
|
15
|
+
# b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
|
16
|
+
#
|
17
|
+
# Options available are:
|
18
|
+
# url: lambda { ENV["REDIS_URL"] }
|
19
|
+
# scheme: "redis"
|
20
|
+
# host: "127.0.0.1"
|
21
|
+
# port: 6379
|
22
|
+
# path: nil
|
23
|
+
# timeout: 5.0
|
24
|
+
# password: nil
|
25
|
+
# db: 0
|
26
|
+
# driver: nil
|
27
|
+
# id: nil
|
28
|
+
# tcp_keepalive: 0
|
29
|
+
# reconnect_attempts: 1
|
30
|
+
# inherit_socket: false
|
31
|
+
def initialize(options = {})
|
32
|
+
begin # because some people don't have redis installed
|
33
|
+
require 'redis'
|
34
|
+
rescue LoadError
|
35
|
+
raise NoRedisError
|
36
|
+
end
|
37
|
+
|
38
|
+
@redis = Redis.new(options)
|
39
|
+
@redis.setnx(:total_words, 0)
|
40
|
+
@redis.setnx(:total_trainings, 0)
|
41
|
+
end
|
42
|
+
|
43
|
+
def total_words
|
44
|
+
@redis.get(:total_words).to_i
|
45
|
+
end
|
46
|
+
|
47
|
+
def update_total_words(diff)
|
48
|
+
@redis.incrby(:total_words, diff)
|
49
|
+
end
|
50
|
+
|
51
|
+
def total_trainings
|
52
|
+
@redis.get(:total_trainings).to_i
|
53
|
+
end
|
54
|
+
|
55
|
+
def update_total_trainings(diff)
|
56
|
+
@redis.incrby(:total_trainings, diff)
|
57
|
+
end
|
58
|
+
|
59
|
+
def category_training_count(category)
|
60
|
+
@redis.hget(:category_training_count, category).to_i
|
61
|
+
end
|
62
|
+
|
63
|
+
def update_category_training_count(category, diff)
|
64
|
+
@redis.hincrby(:category_training_count, category, diff)
|
65
|
+
end
|
66
|
+
|
67
|
+
def category_has_trainings?(category)
|
68
|
+
category_training_count(category) > 0
|
69
|
+
end
|
70
|
+
|
71
|
+
def category_word_count(category)
|
72
|
+
@redis.hget(:category_word_count, category).to_i
|
73
|
+
end
|
74
|
+
|
75
|
+
def update_category_word_count(category, diff)
|
76
|
+
@redis.hincrby(:category_word_count, category, diff)
|
77
|
+
end
|
78
|
+
|
79
|
+
def add_category(category)
|
80
|
+
@redis.sadd(:category_keys, category)
|
81
|
+
end
|
82
|
+
|
83
|
+
def category_keys
|
84
|
+
@redis.smembers(:category_keys).map(&:intern)
|
85
|
+
end
|
86
|
+
|
87
|
+
def category_word_frequency(category, word)
|
88
|
+
@redis.hget(category, word).to_i
|
89
|
+
end
|
90
|
+
|
91
|
+
def update_category_word_frequency(category, word, diff)
|
92
|
+
@redis.hincrby(category, word, diff)
|
93
|
+
end
|
94
|
+
|
95
|
+
def delete_category_word(category, word)
|
96
|
+
@redis.hdel(category, word)
|
97
|
+
end
|
98
|
+
|
99
|
+
def word_in_category?(category, word)
|
100
|
+
@redis.hexists(category, word)
|
101
|
+
end
|
102
|
+
|
103
|
+
def reset
|
104
|
+
@redis.flushdb
|
105
|
+
@redis.set(:total_words, 0)
|
106
|
+
@redis.set(:total_trainings, 0)
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class NoRedisError < RuntimeError
|
4
|
+
def initialize
|
5
|
+
msg =
|
6
|
+
%q(The Redis Backend can only be used if Redis is installed.
|
7
|
+
This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
|
8
|
+
If you have encountered this error and would like to use the Redis Backend,
|
9
|
+
please run 'gem install redis' or include 'gem "redis"' in
|
10
|
+
your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
|
11
|
+
)
|
12
|
+
super(msg)
|
13
|
+
end
|
14
|
+
end
|
@@ -1,8 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
5
|
# License:: LGPL
|
4
6
|
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
require_relative 'extensions/tokenizer/whitespace'
|
10
|
+
require_relative 'extensions/token_filter/stopword'
|
11
|
+
require_relative 'extensions/token_filter/stemmer'
|
5
12
|
require_relative 'category_namer'
|
13
|
+
require_relative 'backends/bayes_memory_backend'
|
14
|
+
require_relative 'backends/bayes_redis_backend'
|
6
15
|
|
7
16
|
module ClassifierReborn
|
8
17
|
class Bayes
|
@@ -13,33 +22,46 @@ module ClassifierReborn
|
|
13
22
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
14
23
|
#
|
15
24
|
# Options available are:
|
16
|
-
# language: 'en'
|
17
|
-
# auto_categorize: false
|
18
|
-
# enable_threshold: false
|
19
|
-
# threshold: 0.0
|
25
|
+
# language: 'en' Used to select language specific stop words
|
26
|
+
# auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
|
27
|
+
# enable_threshold: false When true, enables a threshold requirement for classification
|
28
|
+
# threshold: 0.0 Default threshold, only used when enabled
|
29
|
+
# enable_stemmer: true When false, disables word stemming
|
30
|
+
# stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
|
31
|
+
# backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
|
20
32
|
def initialize(*args)
|
21
|
-
@
|
22
|
-
options = { language:
|
23
|
-
auto_categorize: false,
|
33
|
+
@initial_categories = []
|
34
|
+
options = { language: 'en',
|
24
35
|
enable_threshold: false,
|
25
|
-
threshold:
|
26
|
-
|
27
|
-
|
28
|
-
|
36
|
+
threshold: 0.0,
|
37
|
+
enable_stemmer: true,
|
38
|
+
backend: BayesMemoryBackend.new }
|
39
|
+
args.flatten.each do |arg|
|
40
|
+
if arg.is_a?(Hash)
|
29
41
|
options.merge!(arg)
|
30
42
|
else
|
31
|
-
|
43
|
+
@initial_categories.push(arg)
|
32
44
|
end
|
33
|
-
|
45
|
+
end
|
34
46
|
|
35
|
-
|
36
|
-
|
37
|
-
|
47
|
+
unless options.key?(:auto_categorize)
|
48
|
+
options[:auto_categorize] = @initial_categories.empty? ? true : false
|
49
|
+
end
|
38
50
|
|
39
51
|
@language = options[:language]
|
40
52
|
@auto_categorize = options[:auto_categorize]
|
41
53
|
@enable_threshold = options[:enable_threshold]
|
42
54
|
@threshold = options[:threshold]
|
55
|
+
@enable_stemmer = options[:enable_stemmer]
|
56
|
+
@backend = options[:backend]
|
57
|
+
@tokenizer = options[:tokenizer] || Tokenizer::Whitespace
|
58
|
+
@token_filters = options[:token_filters] || [TokenFilter::Stopword]
|
59
|
+
@token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
|
60
|
+
TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
|
61
|
+
|
62
|
+
populate_initial_categories
|
63
|
+
|
64
|
+
custom_stopwords options[:stopwords] if options.key?(:stopwords)
|
43
65
|
end
|
44
66
|
|
45
67
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -49,23 +71,28 @@ module ClassifierReborn
|
|
49
71
|
# b.train "that", "That text"
|
50
72
|
# b.train "The other", "The other text"
|
51
73
|
def train(category, text)
|
74
|
+
word_hash = Hasher.word_hash(text, @enable_stemmer,
|
75
|
+
tokenizer: @tokenizer, token_filters: @token_filters)
|
76
|
+
return if word_hash.empty?
|
77
|
+
|
52
78
|
category = CategoryNamer.prepare_name(category)
|
53
79
|
|
54
80
|
# Add the category dynamically or raise an error
|
55
|
-
|
81
|
+
unless category_keys.include?(category)
|
56
82
|
if @auto_categorize
|
57
83
|
add_category(category)
|
58
84
|
else
|
59
|
-
raise CategoryNotFoundError
|
85
|
+
raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
|
60
86
|
end
|
61
87
|
end
|
62
88
|
|
63
|
-
|
64
|
-
|
65
|
-
@
|
66
|
-
@
|
67
|
-
@total_words += count
|
89
|
+
word_hash.each do |word, count|
|
90
|
+
@backend.update_category_word_frequency(category, word, count)
|
91
|
+
@backend.update_category_word_count(category, count)
|
92
|
+
@backend.update_total_words(count)
|
68
93
|
end
|
94
|
+
@backend.update_total_trainings(1)
|
95
|
+
@backend.update_category_training_count(category, 1)
|
69
96
|
end
|
70
97
|
|
71
98
|
# Provides an untraining method for all categories specified in Bayes#new
|
@@ -76,23 +103,26 @@ module ClassifierReborn
|
|
76
103
|
# b.train :this, "This text"
|
77
104
|
# b.untrain :this, "This text"
|
78
105
|
def untrain(category, text)
|
106
|
+
word_hash = Hasher.word_hash(text, @enable_stemmer,
|
107
|
+
tokenizer: @tokenizer, token_filters: @token_filters)
|
108
|
+
return if word_hash.empty?
|
109
|
+
|
79
110
|
category = CategoryNamer.prepare_name(category)
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
end
|
89
|
-
|
90
|
-
if @category_word_count[category] >= count
|
91
|
-
@category_word_count[category] -= count
|
92
|
-
end
|
93
|
-
@total_words -= count
|
111
|
+
word_hash.each do |word, count|
|
112
|
+
next if @backend.total_words < 0
|
113
|
+
|
114
|
+
orig = @backend.category_word_frequency(category, word) || 0
|
115
|
+
@backend.update_category_word_frequency(category, word, -count)
|
116
|
+
if @backend.category_word_frequency(category, word) <= 0
|
117
|
+
@backend.delete_category_word(category, word)
|
118
|
+
count = orig
|
94
119
|
end
|
120
|
+
|
121
|
+
@backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
|
122
|
+
@backend.update_total_words(-count)
|
95
123
|
end
|
124
|
+
@backend.update_total_trainings(-1)
|
125
|
+
@backend.update_category_training_count(category, -1)
|
96
126
|
end
|
97
127
|
|
98
128
|
# Returns the scores in each category the provided +text+. E.g.,
|
@@ -100,21 +130,27 @@ module ClassifierReborn
|
|
100
130
|
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
101
131
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
102
132
|
def classifications(text)
|
103
|
-
score =
|
104
|
-
word_hash = Hasher.word_hash(text, @
|
105
|
-
|
106
|
-
|
133
|
+
score = {}
|
134
|
+
word_hash = Hasher.word_hash(text, @enable_stemmer,
|
135
|
+
tokenizer: @tokenizer, token_filters: @token_filters)
|
136
|
+
if word_hash.empty?
|
137
|
+
category_keys.each do |category|
|
138
|
+
score[category.to_s] = Float::INFINITY
|
139
|
+
end
|
140
|
+
return score
|
141
|
+
end
|
142
|
+
category_keys.each do |category|
|
107
143
|
score[category.to_s] = 0
|
108
|
-
total = (@category_word_count
|
109
|
-
word_hash.each do |word,
|
110
|
-
s =
|
111
|
-
score[category.to_s] += Math.log(s/total)
|
144
|
+
total = (@backend.category_word_count(category) || 1).to_f
|
145
|
+
word_hash.each do |word, _count|
|
146
|
+
s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
|
147
|
+
score[category.to_s] += Math.log(s / total)
|
112
148
|
end
|
113
149
|
# now add prior probability for the category
|
114
|
-
s = @
|
115
|
-
score[category.to_s] += Math.log(s /
|
150
|
+
s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
|
151
|
+
score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
|
116
152
|
end
|
117
|
-
|
153
|
+
score
|
118
154
|
end
|
119
155
|
|
120
156
|
# Returns the classification of the provided +text+, which is one of the
|
@@ -128,21 +164,15 @@ module ClassifierReborn
|
|
128
164
|
# Return the classification without the score
|
129
165
|
def classify(text)
|
130
166
|
result, score = classify_with_score(text)
|
131
|
-
if threshold_enabled?
|
132
|
-
|
133
|
-
end
|
134
|
-
return result
|
167
|
+
result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
|
168
|
+
result
|
135
169
|
end
|
136
170
|
|
137
171
|
# Retrieve the current threshold value
|
138
|
-
|
139
|
-
@threshold
|
140
|
-
end
|
172
|
+
attr_reader :threshold
|
141
173
|
|
142
174
|
# Dynamically set the threshold value
|
143
|
-
|
144
|
-
@threshold = a_float
|
145
|
-
end
|
175
|
+
attr_writer :threshold
|
146
176
|
|
147
177
|
# Dynamically enable threshold for classify results
|
148
178
|
def enable_threshold
|
@@ -164,6 +194,16 @@ module ClassifierReborn
|
|
164
194
|
!@enable_threshold
|
165
195
|
end
|
166
196
|
|
197
|
+
# Is word stemming enabled?
|
198
|
+
def stemmer_enabled?
|
199
|
+
@enable_stemmer
|
200
|
+
end
|
201
|
+
|
202
|
+
# Is word stemming disabled?
|
203
|
+
def stemmer_disabled?
|
204
|
+
!@enable_stemmer
|
205
|
+
end
|
206
|
+
|
167
207
|
# Provides training and untraining methods for the categories specified in Bayes#new
|
168
208
|
# For example:
|
169
209
|
# b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
|
@@ -174,21 +214,29 @@ module ClassifierReborn
|
|
174
214
|
def method_missing(name, *args)
|
175
215
|
cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
|
176
216
|
category = CategoryNamer.prepare_name(cleaned_name)
|
177
|
-
if
|
178
|
-
args.each { |text| eval("#{
|
217
|
+
if category_keys.include?(category)
|
218
|
+
args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
|
179
219
|
elsif name.to_s =~ /(un)?train_([\w]+)/
|
180
220
|
raise StandardError, "No such category: #{category}"
|
181
221
|
else
|
182
|
-
super
|
222
|
+
super # raise StandardError, "No such method: #{name}"
|
183
223
|
end
|
184
224
|
end
|
185
225
|
|
186
226
|
# Provides a list of category names
|
187
227
|
# For example:
|
188
228
|
# b.categories
|
189
|
-
# => [
|
190
|
-
def categories
|
191
|
-
|
229
|
+
# => ["This", "That", "The other"]
|
230
|
+
def categories
|
231
|
+
category_keys.collect(&:to_s)
|
232
|
+
end
|
233
|
+
|
234
|
+
# Provides a list of category keys as symbols
|
235
|
+
# For example:
|
236
|
+
# b.category_keys
|
237
|
+
# => [:This, :That, :"The other"]
|
238
|
+
def category_keys
|
239
|
+
@backend.category_keys
|
192
240
|
end
|
193
241
|
|
194
242
|
# Allows you to add categories to the classifier.
|
@@ -200,9 +248,37 @@ module ClassifierReborn
|
|
200
248
|
# more criteria than the trained selective categories. In short,
|
201
249
|
# try to initialize your categories at initialization.
|
202
250
|
def add_category(category)
|
203
|
-
|
251
|
+
category = CategoryNamer.prepare_name(category)
|
252
|
+
@backend.add_category(category)
|
204
253
|
end
|
205
254
|
|
206
255
|
alias append_category add_category
|
256
|
+
|
257
|
+
def reset
|
258
|
+
@backend.reset
|
259
|
+
populate_initial_categories
|
260
|
+
end
|
261
|
+
|
262
|
+
private
|
263
|
+
|
264
|
+
def populate_initial_categories
|
265
|
+
@initial_categories.each do |c|
|
266
|
+
add_category(c)
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
# Overwrites the default stopwords for current language with supplied list of stopwords or file
|
271
|
+
def custom_stopwords(stopwords)
|
272
|
+
unless stopwords.is_a?(Enumerable)
|
273
|
+
if stopwords.strip.empty?
|
274
|
+
stopwords = []
|
275
|
+
elsif File.exist?(stopwords)
|
276
|
+
stopwords = File.read(stopwords).force_encoding('utf-8').split
|
277
|
+
else
|
278
|
+
return # Do not overwrite the default
|
279
|
+
end
|
280
|
+
end
|
281
|
+
TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
|
282
|
+
end
|
207
283
|
end
|
208
284
|
end
|
@@ -1,17 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
5
|
# License:: LGPL
|
4
6
|
|
5
|
-
require 'fast_stemmer'
|
6
7
|
require 'classifier-reborn/extensions/hasher'
|
7
8
|
|
8
9
|
module ClassifierReborn
|
9
10
|
module CategoryNamer
|
10
|
-
|
11
|
-
|
11
|
+
module_function
|
12
|
+
|
13
|
+
def prepare_name(name)
|
12
14
|
return name if name.is_a?(Symbol)
|
13
15
|
|
14
|
-
name.to_s.
|
16
|
+
name.to_s.tr('_', ' ').capitalize.intern
|
15
17
|
end
|
16
18
|
end
|
17
19
|
end
|
@@ -1,59 +1,42 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
3
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
4
5
|
# License:: LGPL
|
5
6
|
|
6
7
|
require 'set'
|
7
8
|
|
9
|
+
require_relative 'tokenizer/whitespace'
|
10
|
+
require_relative 'token_filter/stopword'
|
11
|
+
require_relative 'token_filter/stemmer'
|
12
|
+
|
8
13
|
module ClassifierReborn
|
9
14
|
module Hasher
|
10
|
-
|
11
|
-
|
12
|
-
extend self
|
15
|
+
module_function
|
13
16
|
|
14
17
|
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
15
18
|
# interned, and indexes to its frequency in the document.
|
16
|
-
def word_hash(str,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
|
25
|
-
end
|
26
|
-
|
27
|
-
def word_hash_for_words(words, language = 'en')
|
28
|
-
d = Hash.new(0)
|
29
|
-
words.each do |word|
|
30
|
-
if word.length > 2 && !STOPWORDS[language].include?(word)
|
31
|
-
d[word.stem.intern] += 1
|
19
|
+
def word_hash(str, enable_stemmer = true,
|
20
|
+
tokenizer: Tokenizer::Whitespace,
|
21
|
+
token_filters: [TokenFilter::Stopword])
|
22
|
+
if token_filters.include?(TokenFilter::Stemmer)
|
23
|
+
unless enable_stemmer
|
24
|
+
token_filters.reject! do |token_filter|
|
25
|
+
token_filter == TokenFilter::Stemmer
|
26
|
+
end
|
32
27
|
end
|
28
|
+
else
|
29
|
+
token_filters << TokenFilter::Stemmer if enable_stemmer
|
30
|
+
end
|
31
|
+
words = tokenizer.call(str)
|
32
|
+
token_filters.each do |token_filter|
|
33
|
+
words = token_filter.call(words)
|
33
34
|
end
|
34
|
-
return d
|
35
|
-
end
|
36
|
-
|
37
|
-
def word_hash_for_symbols(words)
|
38
35
|
d = Hash.new(0)
|
39
36
|
words.each do |word|
|
40
37
|
d[word.intern] += 1
|
41
38
|
end
|
42
|
-
|
43
|
-
end
|
44
|
-
|
45
|
-
# Create a lazily-loaded hash of stopword data
|
46
|
-
STOPWORDS = Hash.new do |hash, language|
|
47
|
-
hash[language] = []
|
48
|
-
|
49
|
-
STOPWORDS_PATH.each do |path|
|
50
|
-
if File.exist?(File.join(path, language))
|
51
|
-
hash[language] = Set.new File.read(File.join(path, language.to_s)).split
|
52
|
-
break
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
hash[language]
|
39
|
+
d
|
57
40
|
end
|
58
41
|
end
|
59
42
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
module ClassifierReborn
|
8
|
+
module TokenFilter
|
9
|
+
# This filter converts given tokens to their stemmed versions.
|
10
|
+
module Stemmer
|
11
|
+
module_function
|
12
|
+
|
13
|
+
def call(tokens)
|
14
|
+
tokens.collect do |token|
|
15
|
+
if token.stemmable?
|
16
|
+
token.stem
|
17
|
+
else
|
18
|
+
token
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|