classifier-reborn 2.0.4 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/LICENSE +74 -1
- data/README.markdown +57 -207
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
- data/lib/classifier-reborn/bayes.rb +141 -65
- data/lib/classifier-reborn/category_namer.rb +6 -4
- data/lib/classifier-reborn/extensions/hasher.rb +22 -39
- data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
- data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
- data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
- data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
- data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
- data/lib/classifier-reborn/extensions/vector.rb +35 -28
- data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
- data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
- data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
- data/lib/classifier-reborn/lsi/content_node.rb +35 -25
- data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
- data/lib/classifier-reborn/lsi/word_list.rb +5 -6
- data/lib/classifier-reborn/lsi.rb +166 -94
- data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
- data/lib/classifier-reborn/version.rb +3 -1
- data/lib/classifier-reborn.rb +12 -1
- metadata +98 -17
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ClassifierReborn
  # This class provides Memory as the storage backend for the classifier data structures.
  #
  # All counters are kept in plain Ruby hashes owned by the instance:
  # * @categories maps a category to a Hash (default 0) of word => frequency
  # * @category_counts maps a category to { training: Integer, word: Integer }
  class BayesMemoryBackend
    # Running totals across all categories.
    attr_reader :total_words, :total_trainings

    def initialize
      @total_words = 0
      @total_trainings = 0
      @category_counts = {}
      @categories = {}
    end

    # Adds +diff+ (may be negative) to the overall word total.
    def update_total_words(diff)
      @total_words += diff
    end

    # Adds +diff+ (may be negative) to the overall training total.
    def update_total_trainings(diff)
      @total_trainings += diff
    end

    # Number of trainings recorded for +category+ (0 when none were recorded).
    def category_training_count(category)
      category_counts(category)[:training]
    end

    def update_category_training_count(category, diff)
      category_counts(category)[:training] += diff
    end

    # True only when a counter entry exists for +category+ and its training count is positive.
    def category_has_trainings?(category)
      @category_counts.key?(category) && category_training_count(category) > 0
    end

    # Number of words recorded for +category+ (0 when none were recorded).
    def category_word_count(category)
      category_counts(category)[:word]
    end

    def update_category_word_count(category, diff)
      category_counts(category)[:word] += diff
    end

    # Registers +category+ if it is not known yet and returns its word table.
    def add_category(category)
      @categories[category] ||= Hash.new(0)
    end

    def category_keys
      @categories.keys
    end

    # Frequency of +word+ in +category+. Returns 0 for an unknown word and also
    # for an unknown category, matching what BayesRedisBackend reports
    # (previously an unknown category raised NoMethodError on nil).
    def category_word_frequency(category, word)
      words = @categories[category]
      words ? words[word] : 0
    end

    # Adds +diff+ to the frequency of +word+. The category must have been
    # registered with #add_category first.
    def update_category_word_frequency(category, word, diff)
      @categories[category][word] += diff
    end

    # Removes +word+ from +category+; a no-op (nil) when the category is unknown.
    def delete_category_word(category, word)
      words = @categories[category]
      words.delete(word) if words
    end

    # False (rather than an error) when the category itself is unknown.
    def word_in_category?(category, word)
      @categories.key?(category) && @categories[category].key?(word)
    end

    # Drops every counter by re-running the initializer.
    def reset
      initialize
    end

    private

    # Lazily creates the counter record for +category+.
    def category_counts(category)
      @category_counts[category] ||= { training: 0, word: 0 }
    end
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'no_redis_error'
|
4
|
+
# require redis when we run #initialize. This way only people using this backend
|
5
|
+
# will need to install and load the backend without having to
|
6
|
+
# require 'classifier-reborn/backends/bayes_redis_backend'
|
7
|
+
|
8
|
+
module ClassifierReborn
  # This class provides Redis as the storage backend for the classifier data structures
  class BayesRedisBackend
    # Accepts the same arguments as the redis gem's client, e.g.
    #   b = ClassifierReborn::BayesRedisBackend.new
    #   b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
    #   b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
    #
    # Options available are:
    #   url:                lambda { ENV["REDIS_URL"] }
    #   scheme:             "redis"
    #   host:               "127.0.0.1"
    #   port:               6379
    #   path:               nil
    #   timeout:            5.0
    #   password:           nil
    #   db:                 0
    #   driver:             nil
    #   id:                 nil
    #   tcp_keepalive:      0
    #   reconnect_attempts: 1
    #   inherit_socket:     false
    #
    # Raises NoRedisError when the redis gem cannot be loaded.
    def initialize(options = {})
      load_redis_library

      @redis = Redis.new(options)
      # setnx: seed the global counters without touching values that already exist
      [:total_words, :total_trainings].each { |counter| @redis.setnx(counter, 0) }
    end

    def total_words
      stored = @redis.get(:total_words)
      stored.to_i
    end

    def update_total_words(diff)
      @redis.incrby(:total_words, diff)
    end

    def total_trainings
      stored = @redis.get(:total_trainings)
      stored.to_i
    end

    def update_total_trainings(diff)
      @redis.incrby(:total_trainings, diff)
    end

    # Per-category training counts live in the :category_training_count hash.
    def category_training_count(category)
      stored = @redis.hget(:category_training_count, category)
      stored.to_i
    end

    def update_category_training_count(category, diff)
      @redis.hincrby(:category_training_count, category, diff)
    end

    def category_has_trainings?(category)
      category_training_count(category) > 0
    end

    # Per-category word counts live in the :category_word_count hash.
    def category_word_count(category)
      stored = @redis.hget(:category_word_count, category)
      stored.to_i
    end

    def update_category_word_count(category, diff)
      @redis.hincrby(:category_word_count, category, diff)
    end

    # Category names are tracked in the :category_keys set.
    def add_category(category)
      @redis.sadd(:category_keys, category)
    end

    # Category names as symbols.
    def category_keys
      names = @redis.smembers(:category_keys)
      names.map(&:to_sym)
    end

    # Word frequencies are stored in a hash keyed by the category itself.
    def category_word_frequency(category, word)
      stored = @redis.hget(category, word)
      stored.to_i
    end

    def update_category_word_frequency(category, word, diff)
      @redis.hincrby(category, word, diff)
    end

    def delete_category_word(category, word)
      @redis.hdel(category, word)
    end

    def word_in_category?(category, word)
      @redis.hexists(category, word)
    end

    # Flushes the selected Redis database and re-creates the global counters.
    def reset
      @redis.flushdb
      [:total_words, :total_trainings].each { |counter| @redis.set(counter, 0) }
    end

    private

    # Loaded lazily because some people don't have redis installed.
    def load_redis_library
      require 'redis'
    rescue LoadError
      raise NoRedisError
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Raised by the Redis backend when the +redis+ gem cannot be loaded.
class NoRedisError < RuntimeError
  # Installation hint used when the error is raised without an explicit message.
  DEFAULT_MESSAGE = [
    'The Redis Backend can only be used if Redis is installed.',
    "This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.",
    'If you have encountered this error and would like to use the Redis Backend,',
    "please run 'gem install redis' or include 'gem \"redis\"' in",
    'your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.',
    ''
  ].join("\n").freeze

  # msg:: optional custom message. Accepting it keeps the standard
  #       <tt>raise NoRedisError, "text"</tt> form working; a zero-argument
  #       initializer makes that form fail with ArgumentError.
  def initialize(msg = nil)
    super(msg || DEFAULT_MESSAGE)
  end
end
|
@@ -1,8 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
5
|
# License:: LGPL
|
4
6
|
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
require_relative 'extensions/tokenizer/whitespace'
|
10
|
+
require_relative 'extensions/token_filter/stopword'
|
11
|
+
require_relative 'extensions/token_filter/stemmer'
|
5
12
|
require_relative 'category_namer'
|
13
|
+
require_relative 'backends/bayes_memory_backend'
|
14
|
+
require_relative 'backends/bayes_redis_backend'
|
6
15
|
|
7
16
|
module ClassifierReborn
|
8
17
|
class Bayes
|
@@ -13,33 +22,46 @@ module ClassifierReborn
|
|
13
22
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
14
23
|
#
|
15
24
|
# Options available are:
|
16
|
-
# language: 'en'
|
17
|
-
# auto_categorize: false
|
18
|
-
# enable_threshold: false
|
19
|
-
# threshold: 0.0
|
25
|
+
# language: 'en' Used to select language specific stop words
|
26
|
+
# auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
|
27
|
+
# enable_threshold: false When true, enables a threshold requirement for classification
|
28
|
+
# threshold: 0.0 Default threshold, only used when enabled
|
29
|
+
# enable_stemmer: true When false, disables word stemming
|
30
|
+
# stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
|
31
|
+
# backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
|
20
32
|
# Builds a classifier. Positional arguments (possibly nested arrays) are the
# initial categories; any Hash argument is merged into the options.
# When :auto_categorize is not given it defaults to true only if no initial
# categories were supplied.
def initialize(*args)
  @initial_categories = []
  options = { language: 'en',
              enable_threshold: false,
              threshold: 0.0,
              enable_stemmer: true,
              backend: BayesMemoryBackend.new }
  args.flatten.each do |arg|
    if arg.is_a?(Hash)
      options.merge!(arg)
    else
      @initial_categories.push(arg)
    end
  end

  options[:auto_categorize] = @initial_categories.empty? unless options.key?(:auto_categorize)

  @language = options[:language]
  @auto_categorize = options[:auto_categorize]
  @enable_threshold = options[:enable_threshold]
  @threshold = options[:threshold]
  @enable_stemmer = options[:enable_stemmer]
  @backend = options[:backend]
  @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
  # Work on a private copy: the stemmer may be appended below and the caller's
  # array must not be modified (it could be shared or frozen).
  @token_filters = (options[:token_filters] || [TokenFilter::Stopword]).dup
  @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
  TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)

  populate_initial_categories

  custom_stopwords options[:stopwords] if options.key?(:stopwords)
end
|
44
66
|
|
45
67
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -49,23 +71,28 @@ module ClassifierReborn
|
|
49
71
|
# b.train "that", "That text"
|
50
72
|
# b.train "The other", "The other text"
|
51
73
|
# Records +text+ as an example of +category+.
# Does nothing when the text yields no tokens. An unknown category is created
# on the fly when auto-categorisation is on, otherwise CategoryNotFoundError is raised.
def train(category, text)
  frequencies = Hasher.word_hash(text, @enable_stemmer,
                                 tokenizer: @tokenizer, token_filters: @token_filters)
  return if frequencies.empty?

  category = CategoryNamer.prepare_name(category)

  # Add the category dynamically or raise an error
  unless category_keys.include?(category)
    raise CategoryNotFoundError, "Cannot train; category #{category} does not exist" unless @auto_categorize

    add_category(category)
  end

  frequencies.each_pair do |word, occurrences|
    @backend.update_category_word_frequency(category, word, occurrences)
    @backend.update_category_word_count(category, occurrences)
    @backend.update_total_words(occurrences)
  end
  @backend.update_total_trainings(1)
  @backend.update_category_training_count(category, 1)
end
|
70
97
|
|
71
98
|
# Provides an untraining method for all categories specified in Bayes#new
|
@@ -76,23 +103,26 @@ module ClassifierReborn
|
|
76
103
|
# b.train :this, "This text"
|
77
104
|
# b.untrain :this, "This text"
|
78
105
|
# Reverses the effect of training +text+ under +category+.
# Does nothing when the text yields no tokens.
def untrain(category, text)
  frequencies = Hasher.word_hash(text, @enable_stemmer,
                                 tokenizer: @tokenizer, token_filters: @token_filters)
  return if frequencies.empty?

  category = CategoryNamer.prepare_name(category)
  frequencies.each_pair do |word, occurrences|
    next if @backend.total_words < 0

    previous = @backend.category_word_frequency(category, word) || 0
    @backend.update_category_word_frequency(category, word, -occurrences)

    # When the word drops out of the category, only the amount that was
    # actually stored is subtracted from the counters below.
    removed = occurrences
    if @backend.category_word_frequency(category, word) <= 0
      @backend.delete_category_word(category, word)
      removed = previous
    end

    if @backend.category_word_count(category) >= removed
      @backend.update_category_word_count(category, -removed)
    end
    @backend.update_total_words(-removed)
  end
  @backend.update_total_trainings(-1)
  @backend.update_category_training_count(category, -1)
end
|
97
127
|
|
98
128
|
# Returns the scores in each category for the provided +text+. E.g.,
|
@@ -100,21 +130,27 @@ module ClassifierReborn
|
|
100
130
|
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
101
131
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
102
132
|
# Computes a log-score per category (keyed by the category name as a String).
# When the text yields no tokens every category scores Float::INFINITY.
def classifications(text)
  word_hash = Hasher.word_hash(text, @enable_stemmer,
                               tokenizer: @tokenizer, token_filters: @token_filters)
  if word_hash.empty?
    return category_keys.each_with_object({}) do |category, scores|
      scores[category.to_s] = Float::INFINITY
    end
  end

  category_keys.each_with_object({}) do |category, scores|
    word_total = (@backend.category_word_count(category) || 1).to_f
    likelihood = word_hash.each_key.reduce(0) do |sum, word|
      # words never seen in this category get a small pseudo-count of 0.1
      frequency =
        if @backend.word_in_category?(category, word)
          @backend.category_word_frequency(category, word)
        else
          0.1
        end
      sum + Math.log(frequency / word_total)
    end

    # now add prior probability for the category
    trainings =
      if @backend.category_has_trainings?(category)
        @backend.category_training_count(category)
      else
        0.1
      end
    scores[category.to_s] = likelihood + Math.log(trainings / @backend.total_trainings.to_f)
  end
end
|
119
155
|
|
120
156
|
# Returns the classification of the provided +text+, which is one of the
|
@@ -128,21 +164,15 @@ module ClassifierReborn
|
|
128
164
|
# Return the classification without the score
|
129
165
|
# Returns the best category for +text+, or nil when thresholding is enabled
# and the winning score is below the threshold or is Float::INFINITY.
def classify(text)
  result, score = classify_with_score(text)
  return result unless threshold_enabled?
  return nil if score < @threshold || score == Float::INFINITY

  result
end
|
136
170
|
|
137
171
|
# Retrieve the current threshold value
|
138
|
-
|
139
|
-
@threshold
|
140
|
-
end
|
172
|
+
attr_reader :threshold
|
141
173
|
|
142
174
|
# Dynamically set the threshold value
|
143
|
-
|
144
|
-
@threshold = a_float
|
145
|
-
end
|
175
|
+
attr_writer :threshold
|
146
176
|
|
147
177
|
# Dynamically enable threshold for classify results
|
148
178
|
def enable_threshold
|
@@ -164,6 +194,16 @@ module ClassifierReborn
|
|
164
194
|
!@enable_threshold
|
165
195
|
end
|
166
196
|
|
197
|
+
# Is word stemming enabled?
|
198
|
+
def stemmer_enabled?
|
199
|
+
@enable_stemmer
|
200
|
+
end
|
201
|
+
|
202
|
+
# Is word stemming disabled?
|
203
|
+
def stemmer_disabled?
|
204
|
+
!@enable_stemmer
|
205
|
+
end
|
206
|
+
|
167
207
|
# Provides training and untraining methods for the categories specified in Bayes#new
|
168
208
|
# For example:
|
169
209
|
# b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
|
@@ -174,21 +214,29 @@ module ClassifierReborn
|
|
174
214
|
# Dispatches dynamic train_<category> / untrain_<category> calls.
# Each argument is passed as a text to #train or #untrain for the category.
# Raises StandardError when the name looks like a (un)train call but the
# category does not exist; everything else goes to the default handler.
def method_missing(name, *args)
  cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
  # Capture the optional "un" prefix right after the gsub that produced it;
  # it is nil when the name did not match, in which case #train is used.
  prefix = Regexp.last_match(1)
  category = CategoryNamer.prepare_name(cleaned_name)
  if category_keys.include?(category)
    # Direct dispatch instead of building and eval-ing a source string.
    trainer = :"#{prefix}train"
    args.each { |text| send(trainer, category, text) }
  elsif name.to_s =~ /(un)?train_([\w]+)/
    raise StandardError, "No such category: #{category}"
  else
    super # raise StandardError, "No such method: #{name}"
  end
end
|
185
225
|
|
186
226
|
# Provides a list of category names
|
187
227
|
# For example:
|
188
228
|
# b.categories
|
189
|
-
# => [
|
190
|
-
def categories
|
191
|
-
|
229
|
+
# => ["This", "That", "The other"]
|
230
|
+
def categories
|
231
|
+
category_keys.collect(&:to_s)
|
232
|
+
end
|
233
|
+
|
234
|
+
# Provides a list of category keys as symbols
|
235
|
+
# For example:
|
236
|
+
# b.category_keys
|
237
|
+
# => [:This, :That, :"The other"]
|
238
|
+
def category_keys
|
239
|
+
@backend.category_keys
|
192
240
|
end
|
193
241
|
|
194
242
|
# Allows you to add categories to the classifier.
|
@@ -200,9 +248,37 @@ module ClassifierReborn
|
|
200
248
|
# more criteria than the trained selective categories. In short,
|
201
249
|
# try to initialize your categories at initialization.
|
202
250
|
def add_category(category)
|
203
|
-
|
251
|
+
category = CategoryNamer.prepare_name(category)
|
252
|
+
@backend.add_category(category)
|
204
253
|
end
|
205
254
|
|
206
255
|
alias append_category add_category
|
256
|
+
|
257
|
+
def reset
|
258
|
+
@backend.reset
|
259
|
+
populate_initial_categories
|
260
|
+
end
|
261
|
+
|
262
|
+
private
|
263
|
+
|
264
|
+
def populate_initial_categories
|
265
|
+
@initial_categories.each do |c|
|
266
|
+
add_category(c)
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
# Overwrites the default stopwords for current language with supplied list of stopwords or file
|
271
|
+
# Overwrites the default stopwords for current language with supplied list of stopwords or file
#
# stopwords:: an Enumerable of words, a path to a whitespace-separated word
#             file, a blank string (disables stopwords), or nil (keep defaults).
def custom_stopwords(stopwords)
  # nil is the documented default for the :stopwords option; without this
  # guard it reached String#strip and raised NoMethodError.
  return if stopwords.nil?

  unless stopwords.is_a?(Enumerable)
    if stopwords.strip.empty?
      stopwords = []
    elsif File.exist?(stopwords)
      stopwords = File.read(stopwords).force_encoding('utf-8').split
    else
      return # Do not overwrite the default
    end
  end
  TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
end
|
207
283
|
end
|
208
284
|
end
|
@@ -1,17 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
5
|
# License:: LGPL
|
4
6
|
|
5
|
-
require 'fast_stemmer'
|
6
7
|
require 'classifier-reborn/extensions/hasher'
|
7
8
|
|
8
9
|
module ClassifierReborn
  # Normalises user-supplied category names into the symbols used as category keys.
  module CategoryNamer
    module_function

    # Symbols pass through untouched. Anything else is stringified, has its
    # underscores turned into spaces, is capitalized and then interned.
    def prepare_name(name)
      case name
      when Symbol then name
      else
        readable = name.to_s.tr('_', ' ')
        readable.capitalize.to_sym
      end
    end
  end
end
|
@@ -1,59 +1,42 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
3
4
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
4
5
|
# License:: LGPL
|
5
6
|
|
6
7
|
require 'set'
|
7
8
|
|
9
|
+
require_relative 'tokenizer/whitespace'
|
10
|
+
require_relative 'token_filter/stopword'
|
11
|
+
require_relative 'token_filter/stemmer'
|
12
|
+
|
8
13
|
module ClassifierReborn
  # Turns raw text into word-frequency hashes.
  module Hasher
    module_function

    # Return a Hash of symbols => ints. The string is split by +tokenizer+,
    # passed through each of the +token_filters+ in order, and every
    # remaining token is interned and counted.
    #
    # str::            the text to hash
    # enable_stemmer:: when true the stemmer filter is appended if absent;
    #                  when false it is left out even if supplied
    # tokenizer::      object responding to #call(str), returns the tokens
    # token_filters::  objects responding to #call(tokens). This array is
    #                  never modified; a derived list is used instead, so
    #                  shared or frozen arrays are safe to pass.
    #
    # Missing keys of the returned Hash read as 0.
    def word_hash(str, enable_stemmer = true,
                  tokenizer: Tokenizer::Whitespace,
                  token_filters: [TokenFilter::Stopword])
      filters =
        if !enable_stemmer
          token_filters.reject { |token_filter| token_filter == TokenFilter::Stemmer }
        elsif token_filters.include?(TokenFilter::Stemmer)
          token_filters
        else
          token_filters + [TokenFilter::Stemmer]
        end

      words = filters.reduce(tokenizer.call(str)) do |tokens, token_filter|
        token_filter.call(tokens)
      end

      words.each_with_object(Hash.new(0)) do |word, frequencies|
        frequencies[word.intern] += 1
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
4
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
5
|
+
# License:: LGPL
|
6
|
+
|
7
|
+
module ClassifierReborn
  module TokenFilter
    # This filter converts given tokens to their stemmed versions.
    module Stemmer
      module_function

      # Returns a new list in which every token that reports itself as
      # stemmable is replaced by its stem; all other tokens are kept as they are.
      def call(tokens)
        tokens.map { |token| token.stemmable? ? token.stem : token }
      end
    end
  end
end
|