classifier-reborn 2.0.4 → 2.3.0

Files changed (36)
  1. checksums.yaml +5 -5
  2. data/LICENSE +74 -1
  3. data/README.markdown +57 -207
  4. data/data/stopwords/ar +104 -0
  5. data/data/stopwords/bn +362 -0
  6. data/data/stopwords/hi +97 -0
  7. data/data/stopwords/ja +43 -0
  8. data/data/stopwords/ru +420 -0
  9. data/data/stopwords/tr +199 -30
  10. data/data/stopwords/vi +647 -0
  11. data/data/stopwords/zh +125 -0
  12. data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
  13. data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
  14. data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
  15. data/lib/classifier-reborn/bayes.rb +141 -65
  16. data/lib/classifier-reborn/category_namer.rb +6 -4
  17. data/lib/classifier-reborn/extensions/hasher.rb +22 -39
  18. data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
  19. data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
  20. data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
  21. data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
  22. data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
  23. data/lib/classifier-reborn/extensions/vector.rb +35 -28
  24. data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
  25. data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
  26. data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
  27. data/lib/classifier-reborn/lsi/content_node.rb +35 -25
  28. data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
  29. data/lib/classifier-reborn/lsi/word_list.rb +5 -6
  30. data/lib/classifier-reborn/lsi.rb +166 -94
  31. data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
  32. data/lib/classifier-reborn/version.rb +3 -1
  33. data/lib/classifier-reborn.rb +12 -1
  34. metadata +98 -17
  35. data/bin/bayes.rb +0 -36
  36. data/bin/summarize.rb +0 -16
data/lib/classifier-reborn/backends/bayes_memory_backend.rb
@@ -0,0 +1,77 @@
+ # frozen_string_literal: true
+
+ module ClassifierReborn
+   class BayesMemoryBackend
+     attr_reader :total_words, :total_trainings
+
+     # This class provides Memory as the storage backend for the classifier data structures
+     def initialize
+       @total_words = 0
+       @total_trainings = 0
+       @category_counts = {}
+       @categories = {}
+     end
+
+     def update_total_words(diff)
+       @total_words += diff
+     end
+
+     def update_total_trainings(diff)
+       @total_trainings += diff
+     end
+
+     def category_training_count(category)
+       category_counts(category)[:training]
+     end
+
+     def update_category_training_count(category, diff)
+       category_counts(category)[:training] += diff
+     end
+
+     def category_has_trainings?(category)
+       @category_counts.key?(category) && category_training_count(category) > 0
+     end
+
+     def category_word_count(category)
+       category_counts(category)[:word]
+     end
+
+     def update_category_word_count(category, diff)
+       category_counts(category)[:word] += diff
+     end
+
+     def add_category(category)
+       @categories[category] ||= Hash.new(0)
+     end
+
+     def category_keys
+       @categories.keys
+     end
+
+     def category_word_frequency(category, word)
+       @categories[category][word]
+     end
+
+     def update_category_word_frequency(category, word, diff)
+       @categories[category][word] += diff
+     end
+
+     def delete_category_word(category, word)
+       @categories[category].delete(word)
+     end
+
+     def word_in_category?(category, word)
+       @categories[category].key?(word)
+     end
+
+     def reset
+       initialize
+     end
+
+     private
+
+     def category_counts(category)
+       @category_counts[category] ||= { training: 0, word: 0 }
+     end
+   end
+ end
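For orientation, a minimal usage sketch of the new backend option (the category names, training text, and expected result are illustrative, not taken from the diff): the memory backend is the default, so passing it explicitly is equivalent to omitting the backend option.

    require 'classifier-reborn'

    # BayesMemoryBackend is the default backend; shown explicitly here for clarity.
    memory     = ClassifierReborn::BayesMemoryBackend.new
    classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', backend: memory

    classifier.train 'Interesting', 'Here are some interesting things'   # sample text
    classifier.classify 'What a fascinating read'                        # => "Interesting" (expected)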
data/lib/classifier-reborn/backends/bayes_redis_backend.rb
@@ -0,0 +1,109 @@
+ # frozen_string_literal: true
+
+ require_relative 'no_redis_error'
+ # require redis when we run #intialize. This way only people using this backend
+ # will need to install and load the backend without having to
+ # require 'classifier-reborn/backends/bayes_redis_backend'
+
+ module ClassifierReborn
+   # This class provides Redis as the storage backend for the classifier data structures
+   class BayesRedisBackend
+     # The class can be created with the same arguments that the redis gem accepts
+     # E.g.,
+     #   b = ClassifierReborn::BayesRedisBackend.new
+     #   b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
+     #   b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
+     #
+     # Options available are:
+     #   url: lambda { ENV["REDIS_URL"] }
+     #   scheme: "redis"
+     #   host: "127.0.0.1"
+     #   port: 6379
+     #   path: nil
+     #   timeout: 5.0
+     #   password: nil
+     #   db: 0
+     #   driver: nil
+     #   id: nil
+     #   tcp_keepalive: 0
+     #   reconnect_attempts: 1
+     #   inherit_socket: false
+     def initialize(options = {})
+       begin # because some people don't have redis installed
+         require 'redis'
+       rescue LoadError
+         raise NoRedisError
+       end
+
+       @redis = Redis.new(options)
+       @redis.setnx(:total_words, 0)
+       @redis.setnx(:total_trainings, 0)
+     end
+
+     def total_words
+       @redis.get(:total_words).to_i
+     end
+
+     def update_total_words(diff)
+       @redis.incrby(:total_words, diff)
+     end
+
+     def total_trainings
+       @redis.get(:total_trainings).to_i
+     end
+
+     def update_total_trainings(diff)
+       @redis.incrby(:total_trainings, diff)
+     end
+
+     def category_training_count(category)
+       @redis.hget(:category_training_count, category).to_i
+     end
+
+     def update_category_training_count(category, diff)
+       @redis.hincrby(:category_training_count, category, diff)
+     end
+
+     def category_has_trainings?(category)
+       category_training_count(category) > 0
+     end
+
+     def category_word_count(category)
+       @redis.hget(:category_word_count, category).to_i
+     end
+
+     def update_category_word_count(category, diff)
+       @redis.hincrby(:category_word_count, category, diff)
+     end
+
+     def add_category(category)
+       @redis.sadd(:category_keys, category)
+     end
+
+     def category_keys
+       @redis.smembers(:category_keys).map(&:intern)
+     end
+
+     def category_word_frequency(category, word)
+       @redis.hget(category, word).to_i
+     end
+
+     def update_category_word_frequency(category, word, diff)
+       @redis.hincrby(category, word, diff)
+     end
+
+     def delete_category_word(category, word)
+       @redis.hdel(category, word)
+     end
+
+     def word_in_category?(category, word)
+       @redis.hexists(category, word)
+     end
+
+     def reset
+       @redis.flushdb
+       @redis.set(:total_words, 0)
+       @redis.set(:total_trainings, 0)
+     end
+   end
+ end
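A hedged usage sketch for the Redis backend, assuming the redis gem is installed and a Redis server is reachable at the (illustrative) address shown; any option accepted by Redis.new, as listed in the comment above, can be passed through.

    require 'classifier-reborn'

    # Connection details are illustrative only.
    backend    = ClassifierReborn::BayesRedisBackend.new url: 'redis://127.0.0.1:6379/0'
    classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', backend: backend

    classifier.train 'Interesting', 'Here are some interesting things'
    # Training counts and word frequencies now live in Redis and persist across
    # processes until backend.reset (which calls FLUSHDB) is invoked.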
data/lib/classifier-reborn/backends/no_redis_error.rb
@@ -0,0 +1,14 @@
+ # frozen_string_literal: true
+
+ class NoRedisError < RuntimeError
+   def initialize
+     msg =
+       %q(The Redis Backend can only be used if Redis is installed.
+          This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
+          If you have encountered this error and would like to use the Redis Backend,
+          please run 'gem install redis' or include 'gem "redis"' in
+          your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
+       )
+     super(msg)
+   end
+ end
data/lib/classifier-reborn/bayes.rb
@@ -1,8 +1,17 @@
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL

+ require 'set'
+
+ require_relative 'extensions/tokenizer/whitespace'
+ require_relative 'extensions/token_filter/stopword'
+ require_relative 'extensions/token_filter/stemmer'
  require_relative 'category_namer'
+ require_relative 'backends/bayes_memory_backend'
+ require_relative 'backends/bayes_redis_backend'

  module ClassifierReborn
    class Bayes
@@ -13,33 +22,46 @@ module ClassifierReborn
      # b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
      #
      # Options available are:
-     # language: 'en' Used to select language specific stop words
-     # auto_categorize: false When true, enables ability to dynamically declare a category
-     # enable_threshold: false When true, enables a threshold requirement for classifition
-     # threshold: 0.0 Default threshold, only used when enabled
+     # language: 'en' Used to select language specific stop words
+     # auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
+     # enable_threshold: false When true, enables a threshold requirement for classifition
+     # threshold: 0.0 Default threshold, only used when enabled
+     # enable_stemmer: true When false, disables word stemming
+     # stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
+     # backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
      def initialize(*args)
-       @categories = Hash.new
-       options = { language: 'en',
-                   auto_categorize: false,
+       @initial_categories = []
+       options = { language: 'en',
                    enable_threshold: false,
-                   threshold: 0.0
-                 }
-       args.flatten.each { |arg|
-         if arg.kind_of?(Hash)
+                   threshold: 0.0,
+                   enable_stemmer: true,
+                   backend: BayesMemoryBackend.new }
+       args.flatten.each do |arg|
+         if arg.is_a?(Hash)
            options.merge!(arg)
          else
-           add_category(arg)
+           @initial_categories.push(arg)
          end
-       }
+       end

-       @total_words = 0
-       @category_counts = Hash.new(0)
-       @category_word_count = Hash.new(0)
+       unless options.key?(:auto_categorize)
+         options[:auto_categorize] = @initial_categories.empty? ? true : false
+       end

        @language = options[:language]
        @auto_categorize = options[:auto_categorize]
        @enable_threshold = options[:enable_threshold]
        @threshold = options[:threshold]
+       @enable_stemmer = options[:enable_stemmer]
+       @backend = options[:backend]
+       @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
+       @token_filters = options[:token_filters] || [TokenFilter::Stopword]
+       @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
+       TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
+
+       populate_initial_categories
+
+       custom_stopwords options[:stopwords] if options.key?(:stopwords)
      end

      # Provides a general training method for all categories specified in Bayes#new
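To illustrate the reworked constructor, a short sketch (the category names and stopword list are made up): with no initial categories, auto_categorize now defaults to true, and stemming, stopwords, tokenizer, and token filters can all be tuned at construction time.

    require 'classifier-reborn'

    # No initial categories, so auto_categorize defaults to true and
    # categories are created on the fly by #train.
    auto = ClassifierReborn::Bayes.new
    auto.train 'Ruby', 'a dynamic open source programming language'   # creates the :Ruby category

    # Explicit categories, stemming disabled, and an (illustrative) custom stopword list.
    custom = ClassifierReborn::Bayes.new 'Spam', 'Ham',
                                         enable_stemmer: false,
                                         stopwords: %w[the a an and or]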
@@ -49,23 +71,28 @@ module ClassifierReborn
      # b.train "that", "That text"
      # b.train "The other", "The other text"
      def train(category, text)
+       word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                    tokenizer: @tokenizer, token_filters: @token_filters)
+       return if word_hash.empty?
+
        category = CategoryNamer.prepare_name(category)

        # Add the category dynamically or raise an error
-       if !@categories.has_key?(category)
+       unless category_keys.include?(category)
          if @auto_categorize
            add_category(category)
          else
-           raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
+           raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
          end
        end

-       @category_counts[category] += 1
-       Hasher.word_hash(text, @language).each do |word, count|
-         @categories[category][word] += count
-         @category_word_count[category] += count
-         @total_words += count
+       word_hash.each do |word, count|
+         @backend.update_category_word_frequency(category, word, count)
+         @backend.update_category_word_count(category, count)
+         @backend.update_total_words(count)
        end
+       @backend.update_total_trainings(1)
+       @backend.update_category_training_count(category, 1)
      end

      # Provides a untraining method for all categories specified in Bayes#new
@@ -76,23 +103,26 @@ module ClassifierReborn
      # b.train :this, "This text"
      # b.untrain :this, "This text"
      def untrain(category, text)
+       word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                    tokenizer: @tokenizer, token_filters: @token_filters)
+       return if word_hash.empty?
+
        category = CategoryNamer.prepare_name(category)
-       @category_counts[category] -= 1
-       Hasher.word_hash(text, @language).each do |word, count|
-         if @total_words >= 0
-           orig = @categories[category][word] || 0
-           @categories[category][word] -= count
-           if @categories[category][word] <= 0
-             @categories[category].delete(word)
-             count = orig
-           end
-
-           if @category_word_count[category] >= count
-             @category_word_count[category] -= count
-           end
-           @total_words -= count
+       word_hash.each do |word, count|
+         next if @backend.total_words < 0
+
+         orig = @backend.category_word_frequency(category, word) || 0
+         @backend.update_category_word_frequency(category, word, -count)
+         if @backend.category_word_frequency(category, word) <= 0
+           @backend.delete_category_word(category, word)
+           count = orig
          end
+
+         @backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
+         @backend.update_total_words(-count)
        end
+       @backend.update_total_trainings(-1)
+       @backend.update_category_training_count(category, -1)
      end

      # Returns the scores in each category the provided +text+. E.g.,
@@ -100,21 +130,27 @@ module ClassifierReborn
      # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
      # The largest of these scores (the one closest to 0) is the one picked out by #classify
      def classifications(text)
-       score = Hash.new
-       word_hash = Hasher.word_hash(text, @language)
-       training_count = @category_counts.values.reduce(:+).to_f
-       @categories.each do |category, category_words|
+       score = {}
+       word_hash = Hasher.word_hash(text, @enable_stemmer,
+                                    tokenizer: @tokenizer, token_filters: @token_filters)
+       if word_hash.empty?
+         category_keys.each do |category|
+           score[category.to_s] = Float::INFINITY
+         end
+         return score
+       end
+       category_keys.each do |category|
          score[category.to_s] = 0
-         total = (@category_word_count[category] || 1).to_f
-         word_hash.each do |word, count|
-           s = category_words.has_key?(word) ? category_words[word] : 0.1
-           score[category.to_s] += Math.log(s/total)
+         total = (@backend.category_word_count(category) || 1).to_f
+         word_hash.each do |word, _count|
+           s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
+           score[category.to_s] += Math.log(s / total)
          end
          # now add prior probability for the category
-         s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
-         score[category.to_s] += Math.log(s / training_count)
+         s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
+         score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
        end
-       return score
+       score
      end

      # Returns the classification of the provided +text+, which is one of the
@@ -128,21 +164,15 @@ module ClassifierReborn
      # Return the classification without the score
      def classify(text)
        result, score = classify_with_score(text)
-       if threshold_enabled?
-         result = nil if score < @threshold || score == Float::INFINITY
-       end
-       return result
+       result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
+       result
      end

      # Retrieve the current threshold value
-     def threshold
-       @threshold
-     end
+     attr_reader :threshold

      # Dynamically set the threshold value
-     def threshold=(a_float)
-       @threshold = a_float
-     end
+     attr_writer :threshold

      # Dynamically enable threshold for classify results
      def enable_threshold
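A short sketch of the threshold behavior above (the threshold value and sample text are illustrative): when the threshold is enabled and the winning score falls below it, #classify returns nil instead of a category name.

    classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
    classifier.train 'Interesting', 'Here are some interesting things'
    classifier.train 'Uninteresting', 'Here are some dull things'

    classifier.enable_threshold
    classifier.threshold = -10.0          # log-probability cutoff, illustrative value
    classifier.classify 'something completely unrelated'
    # => nil when the best score is below the threshold (or infinite, e.g. for empty input)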
@@ -164,6 +194,16 @@ module ClassifierReborn
        !@enable_threshold
      end

+     # Is word stemming enabled?
+     def stemmer_enabled?
+       @enable_stemmer
+     end
+
+     # Is word stemming disabled?
+     def stemmer_disabled?
+       !@enable_stemmer
+     end
+
      # Provides training and untraining methods for the categories specified in Bayes#new
      # For example:
      # b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
@@ -174,21 +214,29 @@ module ClassifierReborn
      def method_missing(name, *args)
        cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
        category = CategoryNamer.prepare_name(cleaned_name)
-       if @categories.has_key? category
-         args.each { |text| eval("#{$1}train(category, text)") }
+       if category_keys.include?(category)
+         args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
        elsif name.to_s =~ /(un)?train_([\w]+)/
          raise StandardError, "No such category: #{category}"
        else
-         super #raise StandardError, "No such method: #{name}"
+         super # raise StandardError, "No such method: #{name}"
        end
      end

      # Provides a list of category names
      # For example:
      # b.categories
-     # => ['This', 'That', 'the_other']
-     def categories # :nodoc:
-       @categories.keys.collect {|c| c.to_s}
+     # => ["This", "That", "The other"]
+     def categories
+       category_keys.collect(&:to_s)
+     end
+
+     # Provides a list of category keys as symbols
+     # For example:
+     # b.categories
+     # => [:This, :That, :"The other"]
+     def category_keys
+       @backend.category_keys
      end

      # Allows you to add categories to the classifier.
@@ -200,9 +248,37 @@ module ClassifierReborn
      # more criteria than the trained selective categories. In short,
      # try to initialize your categories at initialization.
      def add_category(category)
-       @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
+       category = CategoryNamer.prepare_name(category)
+       @backend.add_category(category)
      end

      alias append_category add_category
+
+     def reset
+       @backend.reset
+       populate_initial_categories
+     end
+
+     private
+
+     def populate_initial_categories
+       @initial_categories.each do |c|
+         add_category(c)
+       end
+     end
+
+     # Overwrites the default stopwords for current language with supplied list of stopwords or file
+     def custom_stopwords(stopwords)
+       unless stopwords.is_a?(Enumerable)
+         if stopwords.strip.empty?
+           stopwords = []
+         elsif File.exist?(stopwords)
+           stopwords = File.read(stopwords).force_encoding('utf-8').split
+         else
+           return # Do not overwrite the default
+         end
+       end
+       TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
+     end
    end
  end
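The reset and custom stopword additions can be exercised like this (the stopword file path is hypothetical): reset clears the backend and re-adds the categories given at construction, and a stopword path, array, or empty value overrides the defaults for the configured language.

    classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting',
                                             stopwords: '/path/to/my/stopwords.txt'   # hypothetical file
    classifier.train 'Interesting', 'Here are some interesting things'

    classifier.reset   # wipes counts in the backend, then re-adds Interesting/Uninteresting

    # Passing an empty string or empty array disables stopword filtering entirely.
    no_stops = ClassifierReborn::Bayes.new 'Spam', 'Ham', stopwords: ''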
data/lib/classifier-reborn/category_namer.rb
@@ -1,17 +1,19 @@
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL

- require 'fast_stemmer'
  require 'classifier-reborn/extensions/hasher'

  module ClassifierReborn
    module CategoryNamer
-     extend self
-     def prepare_name(name)
+     module_function
+
+     def prepare_name(name)
        return name if name.is_a?(Symbol)

-       name.to_s.gsub("_"," ").capitalize.intern
+       name.to_s.tr('_', ' ').capitalize.intern
      end
    end
  end
data/lib/classifier-reborn/extensions/hasher.rb
@@ -1,59 +1,42 @@
- # encoding: utf-8
+ # frozen_string_literal: true
+
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
  # Copyright:: Copyright (c) 2005 Lucas Carlson
  # License:: LGPL

  require 'set'

+ require_relative 'tokenizer/whitespace'
+ require_relative 'token_filter/stopword'
+ require_relative 'token_filter/stemmer'
+
  module ClassifierReborn
    module Hasher
-     STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
-
-     extend self
+     module_function

      # Return a Hash of strings => ints. Each word in the string is stemmed,
      # interned, and indexes to its frequency in the document.
-     def word_hash(str, language = 'en')
-       cleaned_word_hash = clean_word_hash(str, language)
-       symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
-       return cleaned_word_hash.merge(symbol_hash)
-     end
-
-     # Return a word hash without extra punctuation or short symbols, just stemmed words
-     def clean_word_hash(str, language = 'en')
-       word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
-     end
-
-     def word_hash_for_words(words, language = 'en')
-       d = Hash.new(0)
-       words.each do |word|
-         if word.length > 2 && !STOPWORDS[language].include?(word)
-           d[word.stem.intern] += 1
+     def word_hash(str, enable_stemmer = true,
+                   tokenizer: Tokenizer::Whitespace,
+                   token_filters: [TokenFilter::Stopword])
+       if token_filters.include?(TokenFilter::Stemmer)
+         unless enable_stemmer
+           token_filters.reject! do |token_filter|
+             token_filter == TokenFilter::Stemmer
+           end
          end
+       else
+         token_filters << TokenFilter::Stemmer if enable_stemmer
+       end
+       words = tokenizer.call(str)
+       token_filters.each do |token_filter|
+         words = token_filter.call(words)
        end
-       return d
-     end
-
-     def word_hash_for_symbols(words)
        d = Hash.new(0)
        words.each do |word|
          d[word.intern] += 1
        end
-       return d
-     end
-
-     # Create a lazily-loaded hash of stopword data
-     STOPWORDS = Hash.new do |hash, language|
-       hash[language] = []
-
-       STOPWORDS_PATH.each do |path|
-         if File.exist?(File.join(path, language))
-           hash[language] = Set.new File.read(File.join(path, language.to_s)).split
-           break
-         end
-       end
-
-       hash[language]
+       d
      end
    end
  end
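The reworked Hasher.word_hash can also be called directly; a quick sketch of the new keyword arguments (the input strings are illustrative, and the call assumes the stopword filter's default language setting):

    require 'classifier-reborn'

    # Defaults: whitespace tokenizer plus the stopword filter, and the stemmer
    # is appended because enable_stemmer defaults to true.
    ClassifierReborn::Hasher.word_hash('Some interesting text')
    # => a Hash mapping token symbols to their counts

    # Skip stemming and supply the filter chain explicitly.
    ClassifierReborn::Hasher.word_hash('Some interesting text', false,
                                       tokenizer: ClassifierReborn::Tokenizer::Whitespace,
                                       token_filters: [ClassifierReborn::TokenFilter::Stopword])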
data/lib/classifier-reborn/extensions/token_filter/stemmer.rb
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ module ClassifierReborn
+   module TokenFilter
+     # This filter converts given tokens to their stemmed versions.
+     module Stemmer
+       module_function
+
+       def call(tokens)
+         tokens.collect do |token|
+           if token.stemmable?
+             token.stem
+           else
+             token
+           end
+         end
+       end
+     end
+   end
+ end
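Putting the new tokenizer and token-filter pieces together, a minimal pipeline sketch (the sentence is illustrative; it assumes the whitespace tokenizer yields token objects that respond to stemmable? and stem, as the filter above expects):

    require 'classifier-reborn'

    tokens = ClassifierReborn::Tokenizer::Whitespace.call('Stemming filters interesting tokens')
    tokens = ClassifierReborn::TokenFilter::Stopword.call(tokens)   # drops stopwords for the current language
    tokens = ClassifierReborn::TokenFilter::Stemmer.call(tokens)    # replaces each stemmable token with its stem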