classifier-reborn 2.0.4 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +5 -5
  2. data/LICENSE +74 -1
  3. data/README.markdown +57 -207
  4. data/data/stopwords/ar +104 -0
  5. data/data/stopwords/bn +362 -0
  6. data/data/stopwords/hi +97 -0
  7. data/data/stopwords/ja +43 -0
  8. data/data/stopwords/ru +420 -0
  9. data/data/stopwords/tr +199 -30
  10. data/data/stopwords/vi +647 -0
  11. data/data/stopwords/zh +125 -0
  12. data/lib/classifier-reborn/backends/bayes_memory_backend.rb +77 -0
  13. data/lib/classifier-reborn/backends/bayes_redis_backend.rb +109 -0
  14. data/lib/classifier-reborn/backends/no_redis_error.rb +14 -0
  15. data/lib/classifier-reborn/bayes.rb +141 -65
  16. data/lib/classifier-reborn/category_namer.rb +6 -4
  17. data/lib/classifier-reborn/extensions/hasher.rb +22 -39
  18. data/lib/classifier-reborn/extensions/token_filter/stemmer.rb +24 -0
  19. data/lib/classifier-reborn/extensions/token_filter/stopword.rb +48 -0
  20. data/lib/classifier-reborn/extensions/token_filter/symbol.rb +20 -0
  21. data/lib/classifier-reborn/extensions/tokenizer/token.rb +36 -0
  22. data/lib/classifier-reborn/extensions/tokenizer/whitespace.rb +28 -0
  23. data/lib/classifier-reborn/extensions/vector.rb +35 -28
  24. data/lib/classifier-reborn/extensions/vector_serialize.rb +10 -10
  25. data/lib/classifier-reborn/extensions/zero_vector.rb +7 -0
  26. data/lib/classifier-reborn/lsi/cached_content_node.rb +6 -5
  27. data/lib/classifier-reborn/lsi/content_node.rb +35 -25
  28. data/lib/classifier-reborn/lsi/summarizer.rb +7 -5
  29. data/lib/classifier-reborn/lsi/word_list.rb +5 -6
  30. data/lib/classifier-reborn/lsi.rb +166 -94
  31. data/lib/classifier-reborn/validators/classifier_validator.rb +170 -0
  32. data/lib/classifier-reborn/version.rb +3 -1
  33. data/lib/classifier-reborn.rb +12 -1
  34. metadata +98 -17
  35. data/bin/bayes.rb +0 -36
  36. data/bin/summarize.rb +0 -16
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ClassifierReborn
4
+ class BayesMemoryBackend
5
+ attr_reader :total_words, :total_trainings
6
+
7
+ # This class provides Memory as the storage backend for the classifier data structures
8
+ def initialize
9
+ @total_words = 0
10
+ @total_trainings = 0
11
+ @category_counts = {}
12
+ @categories = {}
13
+ end
14
+
15
+ def update_total_words(diff)
16
+ @total_words += diff
17
+ end
18
+
19
+ def update_total_trainings(diff)
20
+ @total_trainings += diff
21
+ end
22
+
23
+ def category_training_count(category)
24
+ category_counts(category)[:training]
25
+ end
26
+
27
+ def update_category_training_count(category, diff)
28
+ category_counts(category)[:training] += diff
29
+ end
30
+
31
+ def category_has_trainings?(category)
32
+ @category_counts.key?(category) && category_training_count(category) > 0
33
+ end
34
+
35
+ def category_word_count(category)
36
+ category_counts(category)[:word]
37
+ end
38
+
39
+ def update_category_word_count(category, diff)
40
+ category_counts(category)[:word] += diff
41
+ end
42
+
43
+ def add_category(category)
44
+ @categories[category] ||= Hash.new(0)
45
+ end
46
+
47
+ def category_keys
48
+ @categories.keys
49
+ end
50
+
51
+ def category_word_frequency(category, word)
52
+ @categories[category][word]
53
+ end
54
+
55
+ def update_category_word_frequency(category, word, diff)
56
+ @categories[category][word] += diff
57
+ end
58
+
59
+ def delete_category_word(category, word)
60
+ @categories[category].delete(word)
61
+ end
62
+
63
+ def word_in_category?(category, word)
64
+ @categories[category].key?(word)
65
+ end
66
+
67
+ def reset
68
+ initialize
69
+ end
70
+
71
+ private
72
+
73
+ def category_counts(category)
74
+ @category_counts[category] ||= { training: 0, word: 0 }
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'no_redis_error'
4
+ # require redis when we run #initialize. This way only people using this backend
5
+ # will need to install and load the backend without having to
6
+ # require 'classifier-reborn/backends/bayes_redis_backend'
7
+
8
+ module ClassifierReborn
9
+ # This class provides Redis as the storage backend for the classifier data structures
10
+ class BayesRedisBackend
11
+ # The class can be created with the same arguments that the redis gem accepts
12
+ # E.g.,
13
+ # b = ClassifierReborn::BayesRedisBackend.new
14
+ # b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
15
+ # b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
16
+ #
17
+ # Options available are:
18
+ # url: lambda { ENV["REDIS_URL"] }
19
+ # scheme: "redis"
20
+ # host: "127.0.0.1"
21
+ # port: 6379
22
+ # path: nil
23
+ # timeout: 5.0
24
+ # password: nil
25
+ # db: 0
26
+ # driver: nil
27
+ # id: nil
28
+ # tcp_keepalive: 0
29
+ # reconnect_attempts: 1
30
+ # inherit_socket: false
31
+ def initialize(options = {})
32
+ begin # because some people don't have redis installed
33
+ require 'redis'
34
+ rescue LoadError
35
+ raise NoRedisError
36
+ end
37
+
38
+ @redis = Redis.new(options)
39
+ @redis.setnx(:total_words, 0)
40
+ @redis.setnx(:total_trainings, 0)
41
+ end
42
+
43
+ def total_words
44
+ @redis.get(:total_words).to_i
45
+ end
46
+
47
+ def update_total_words(diff)
48
+ @redis.incrby(:total_words, diff)
49
+ end
50
+
51
+ def total_trainings
52
+ @redis.get(:total_trainings).to_i
53
+ end
54
+
55
+ def update_total_trainings(diff)
56
+ @redis.incrby(:total_trainings, diff)
57
+ end
58
+
59
+ def category_training_count(category)
60
+ @redis.hget(:category_training_count, category).to_i
61
+ end
62
+
63
+ def update_category_training_count(category, diff)
64
+ @redis.hincrby(:category_training_count, category, diff)
65
+ end
66
+
67
+ def category_has_trainings?(category)
68
+ category_training_count(category) > 0
69
+ end
70
+
71
+ def category_word_count(category)
72
+ @redis.hget(:category_word_count, category).to_i
73
+ end
74
+
75
+ def update_category_word_count(category, diff)
76
+ @redis.hincrby(:category_word_count, category, diff)
77
+ end
78
+
79
+ def add_category(category)
80
+ @redis.sadd(:category_keys, category)
81
+ end
82
+
83
+ def category_keys
84
+ @redis.smembers(:category_keys).map(&:intern)
85
+ end
86
+
87
+ def category_word_frequency(category, word)
88
+ @redis.hget(category, word).to_i
89
+ end
90
+
91
+ def update_category_word_frequency(category, word, diff)
92
+ @redis.hincrby(category, word, diff)
93
+ end
94
+
95
+ def delete_category_word(category, word)
96
+ @redis.hdel(category, word)
97
+ end
98
+
99
+ def word_in_category?(category, word)
100
+ @redis.hexists(category, word)
101
+ end
102
+
103
+ def reset
104
+ @redis.flushdb
105
+ @redis.set(:total_words, 0)
106
+ @redis.set(:total_trainings, 0)
107
+ end
108
+ end
109
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ class NoRedisError < RuntimeError
4
+ def initialize
5
+ msg =
6
+ %q(The Redis Backend can only be used if Redis is installed.
7
+ This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
8
+ If you have encountered this error and would like to use the Redis Backend,
9
+ please run 'gem install redis' or include 'gem "redis"' in
10
+ your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
11
+ )
12
+ super(msg)
13
+ end
14
+ end
@@ -1,8 +1,17 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
5
  # License:: LGPL
4
6
 
7
+ require 'set'
8
+
9
+ require_relative 'extensions/tokenizer/whitespace'
10
+ require_relative 'extensions/token_filter/stopword'
11
+ require_relative 'extensions/token_filter/stemmer'
5
12
  require_relative 'category_namer'
13
+ require_relative 'backends/bayes_memory_backend'
14
+ require_relative 'backends/bayes_redis_backend'
6
15
 
7
16
  module ClassifierReborn
8
17
  class Bayes
@@ -13,33 +22,46 @@ module ClassifierReborn
13
22
  # b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
14
23
  #
15
24
  # Options available are:
16
- # language: 'en' Used to select language specific stop words
17
- # auto_categorize: false When true, enables ability to dynamically declare a category
18
- # enable_threshold: false When true, enables a threshold requirement for classifition
19
- # threshold: 0.0 Default threshold, only used when enabled
25
+ # language: 'en' Used to select language specific stop words
26
+ # auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
27
+ # enable_threshold: false When true, enables a threshold requirement for classification
28
+ # threshold: 0.0 Default threshold, only used when enabled
29
+ # enable_stemmer: true When false, disables word stemming
30
+ # stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
31
+ # backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
20
32
  def initialize(*args)
21
- @categories = Hash.new
22
- options = { language: 'en',
23
- auto_categorize: false,
33
+ @initial_categories = []
34
+ options = { language: 'en',
24
35
  enable_threshold: false,
25
- threshold: 0.0
26
- }
27
- args.flatten.each { |arg|
28
- if arg.kind_of?(Hash)
36
+ threshold: 0.0,
37
+ enable_stemmer: true,
38
+ backend: BayesMemoryBackend.new }
39
+ args.flatten.each do |arg|
40
+ if arg.is_a?(Hash)
29
41
  options.merge!(arg)
30
42
  else
31
- add_category(arg)
43
+ @initial_categories.push(arg)
32
44
  end
33
- }
45
+ end
34
46
 
35
- @total_words = 0
36
- @category_counts = Hash.new(0)
37
- @category_word_count = Hash.new(0)
47
+ unless options.key?(:auto_categorize)
48
+ options[:auto_categorize] = @initial_categories.empty? ? true : false
49
+ end
38
50
 
39
51
  @language = options[:language]
40
52
  @auto_categorize = options[:auto_categorize]
41
53
  @enable_threshold = options[:enable_threshold]
42
54
  @threshold = options[:threshold]
55
+ @enable_stemmer = options[:enable_stemmer]
56
+ @backend = options[:backend]
57
+ @tokenizer = options[:tokenizer] || Tokenizer::Whitespace
58
+ @token_filters = options[:token_filters] || [TokenFilter::Stopword]
59
+ @token_filters << TokenFilter::Stemmer if @enable_stemmer && !@token_filters.include?(TokenFilter::Stemmer)
60
+ TokenFilter::Stopword.language = @language if @token_filters.include?(TokenFilter::Stopword)
61
+
62
+ populate_initial_categories
63
+
64
+ custom_stopwords options[:stopwords] if options.key?(:stopwords)
43
65
  end
44
66
 
45
67
  # Provides a general training method for all categories specified in Bayes#new
@@ -49,23 +71,28 @@ module ClassifierReborn
49
71
  # b.train "that", "That text"
50
72
  # b.train "The other", "The other text"
51
73
  def train(category, text)
74
+ word_hash = Hasher.word_hash(text, @enable_stemmer,
75
+ tokenizer: @tokenizer, token_filters: @token_filters)
76
+ return if word_hash.empty?
77
+
52
78
  category = CategoryNamer.prepare_name(category)
53
79
 
54
80
  # Add the category dynamically or raise an error
55
- if !@categories.has_key?(category)
81
+ unless category_keys.include?(category)
56
82
  if @auto_categorize
57
83
  add_category(category)
58
84
  else
59
- raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
85
+ raise CategoryNotFoundError, "Cannot train; category #{category} does not exist"
60
86
  end
61
87
  end
62
88
 
63
- @category_counts[category] += 1
64
- Hasher.word_hash(text, @language).each do |word, count|
65
- @categories[category][word] += count
66
- @category_word_count[category] += count
67
- @total_words += count
89
+ word_hash.each do |word, count|
90
+ @backend.update_category_word_frequency(category, word, count)
91
+ @backend.update_category_word_count(category, count)
92
+ @backend.update_total_words(count)
68
93
  end
94
+ @backend.update_total_trainings(1)
95
+ @backend.update_category_training_count(category, 1)
69
96
  end
70
97
 
71
98
  # Provides an untraining method for all categories specified in Bayes#new
@@ -76,23 +103,26 @@ module ClassifierReborn
76
103
  # b.train :this, "This text"
77
104
  # b.untrain :this, "This text"
78
105
  def untrain(category, text)
106
+ word_hash = Hasher.word_hash(text, @enable_stemmer,
107
+ tokenizer: @tokenizer, token_filters: @token_filters)
108
+ return if word_hash.empty?
109
+
79
110
  category = CategoryNamer.prepare_name(category)
80
- @category_counts[category] -= 1
81
- Hasher.word_hash(text, @language).each do |word, count|
82
- if @total_words >= 0
83
- orig = @categories[category][word] || 0
84
- @categories[category][word] -= count
85
- if @categories[category][word] <= 0
86
- @categories[category].delete(word)
87
- count = orig
88
- end
89
-
90
- if @category_word_count[category] >= count
91
- @category_word_count[category] -= count
92
- end
93
- @total_words -= count
111
+ word_hash.each do |word, count|
112
+ next if @backend.total_words < 0
113
+
114
+ orig = @backend.category_word_frequency(category, word) || 0
115
+ @backend.update_category_word_frequency(category, word, -count)
116
+ if @backend.category_word_frequency(category, word) <= 0
117
+ @backend.delete_category_word(category, word)
118
+ count = orig
94
119
  end
120
+
121
+ @backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
122
+ @backend.update_total_words(-count)
95
123
  end
124
+ @backend.update_total_trainings(-1)
125
+ @backend.update_category_training_count(category, -1)
96
126
  end
97
127
 
98
128
  # Returns the scores in each category for the provided +text+. E.g.,
@@ -100,21 +130,27 @@ module ClassifierReborn
100
130
  # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
101
131
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
102
132
  def classifications(text)
103
- score = Hash.new
104
- word_hash = Hasher.word_hash(text, @language)
105
- training_count = @category_counts.values.reduce(:+).to_f
106
- @categories.each do |category, category_words|
133
+ score = {}
134
+ word_hash = Hasher.word_hash(text, @enable_stemmer,
135
+ tokenizer: @tokenizer, token_filters: @token_filters)
136
+ if word_hash.empty?
137
+ category_keys.each do |category|
138
+ score[category.to_s] = Float::INFINITY
139
+ end
140
+ return score
141
+ end
142
+ category_keys.each do |category|
107
143
  score[category.to_s] = 0
108
- total = (@category_word_count[category] || 1).to_f
109
- word_hash.each do |word, count|
110
- s = category_words.has_key?(word) ? category_words[word] : 0.1
111
- score[category.to_s] += Math.log(s/total)
144
+ total = (@backend.category_word_count(category) || 1).to_f
145
+ word_hash.each do |word, _count|
146
+ s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
147
+ score[category.to_s] += Math.log(s / total)
112
148
  end
113
149
  # now add prior probability for the category
114
- s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
115
- score[category.to_s] += Math.log(s / training_count)
150
+ s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
151
+ score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
116
152
  end
117
- return score
153
+ score
118
154
  end
119
155
 
120
156
  # Returns the classification of the provided +text+, which is one of the
@@ -128,21 +164,15 @@ module ClassifierReborn
128
164
  # Return the classification without the score
129
165
  def classify(text)
130
166
  result, score = classify_with_score(text)
131
- if threshold_enabled?
132
- result = nil if score < @threshold || score == Float::INFINITY
133
- end
134
- return result
167
+ result = nil if threshold_enabled? && (score < @threshold || score == Float::INFINITY)
168
+ result
135
169
  end
136
170
 
137
171
  # Retrieve the current threshold value
138
- def threshold
139
- @threshold
140
- end
172
+ attr_reader :threshold
141
173
 
142
174
  # Dynamically set the threshold value
143
- def threshold=(a_float)
144
- @threshold = a_float
145
- end
175
+ attr_writer :threshold
146
176
 
147
177
  # Dynamically enable threshold for classify results
148
178
  def enable_threshold
@@ -164,6 +194,16 @@ module ClassifierReborn
164
194
  !@enable_threshold
165
195
  end
166
196
 
197
+ # Is word stemming enabled?
198
+ def stemmer_enabled?
199
+ @enable_stemmer
200
+ end
201
+
202
+ # Is word stemming disabled?
203
+ def stemmer_disabled?
204
+ !@enable_stemmer
205
+ end
206
+
167
207
  # Provides training and untraining methods for the categories specified in Bayes#new
168
208
  # For example:
169
209
  # b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
@@ -174,21 +214,29 @@ module ClassifierReborn
174
214
  def method_missing(name, *args)
175
215
  cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
176
216
  category = CategoryNamer.prepare_name(cleaned_name)
177
- if @categories.has_key? category
178
- args.each { |text| eval("#{$1}train(category, text)") }
217
+ if category_keys.include?(category)
218
+ args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
179
219
  elsif name.to_s =~ /(un)?train_([\w]+)/
180
220
  raise StandardError, "No such category: #{category}"
181
221
  else
182
- super #raise StandardError, "No such method: #{name}"
222
+ super # raise StandardError, "No such method: #{name}"
183
223
  end
184
224
  end
185
225
 
186
226
  # Provides a list of category names
187
227
  # For example:
188
228
  # b.categories
189
- # => ['This', 'That', 'the_other']
190
- def categories # :nodoc:
191
- @categories.keys.collect {|c| c.to_s}
229
+ # => ["This", "That", "The other"]
230
+ def categories
231
+ category_keys.collect(&:to_s)
232
+ end
233
+
234
+ # Provides a list of category keys as symbols
235
+ # For example:
236
+ # b.category_keys
237
+ # => [:This, :That, :"The other"]
238
+ def category_keys
239
+ @backend.category_keys
192
240
  end
193
241
 
194
242
  # Allows you to add categories to the classifier.
@@ -200,9 +248,37 @@ module ClassifierReborn
200
248
  # more criteria than the trained selective categories. In short,
201
249
  # try to initialize your categories at initialization.
202
250
  def add_category(category)
203
- @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
251
+ category = CategoryNamer.prepare_name(category)
252
+ @backend.add_category(category)
204
253
  end
205
254
 
206
255
  alias append_category add_category
256
+
257
+ def reset
258
+ @backend.reset
259
+ populate_initial_categories
260
+ end
261
+
262
+ private
263
+
264
+ def populate_initial_categories
265
+ @initial_categories.each do |c|
266
+ add_category(c)
267
+ end
268
+ end
269
+
270
+ # Overwrites the default stopwords for current language with supplied list of stopwords or file
271
+ def custom_stopwords(stopwords)
272
+ unless stopwords.is_a?(Enumerable)
273
+ if stopwords.strip.empty?
274
+ stopwords = []
275
+ elsif File.exist?(stopwords)
276
+ stopwords = File.read(stopwords).force_encoding('utf-8').split
277
+ else
278
+ return # Do not overwrite the default
279
+ end
280
+ end
281
+ TokenFilter::Stopword::STOPWORDS[@language] = Set.new stopwords
282
+ end
207
283
  end
208
284
  end
@@ -1,17 +1,19 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
5
  # License:: LGPL
4
6
 
5
- require 'fast_stemmer'
6
7
  require 'classifier-reborn/extensions/hasher'
7
8
 
8
9
  module ClassifierReborn
9
10
  module CategoryNamer
10
- extend self
11
- def prepare_name(name)
11
+ module_function
12
+
13
+ def prepare_name(name)
12
14
  return name if name.is_a?(Symbol)
13
15
 
14
- name.to_s.gsub("_"," ").capitalize.intern
16
+ name.to_s.tr('_', ' ').capitalize.intern
15
17
  end
16
18
  end
17
19
  end
@@ -1,59 +1,42 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
3
4
  # Copyright:: Copyright (c) 2005 Lucas Carlson
4
5
  # License:: LGPL
5
6
 
6
7
  require 'set'
7
8
 
9
+ require_relative 'tokenizer/whitespace'
10
+ require_relative 'token_filter/stopword'
11
+ require_relative 'token_filter/stemmer'
12
+
8
13
  module ClassifierReborn
9
14
  module Hasher
10
- STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
11
-
12
- extend self
15
+ module_function
13
16
 
14
17
  # Return a Hash of strings => ints. Each word in the string is stemmed,
15
18
  # interned, and indexes to its frequency in the document.
16
- def word_hash(str, language = 'en')
17
- cleaned_word_hash = clean_word_hash(str, language)
18
- symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
19
- return cleaned_word_hash.merge(symbol_hash)
20
- end
21
-
22
- # Return a word hash without extra punctuation or short symbols, just stemmed words
23
- def clean_word_hash(str, language = 'en')
24
- word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
25
- end
26
-
27
- def word_hash_for_words(words, language = 'en')
28
- d = Hash.new(0)
29
- words.each do |word|
30
- if word.length > 2 && !STOPWORDS[language].include?(word)
31
- d[word.stem.intern] += 1
19
+ def word_hash(str, enable_stemmer = true,
20
+ tokenizer: Tokenizer::Whitespace,
21
+ token_filters: [TokenFilter::Stopword])
22
+ if token_filters.include?(TokenFilter::Stemmer)
23
+ unless enable_stemmer
24
+ token_filters.reject! do |token_filter|
25
+ token_filter == TokenFilter::Stemmer
26
+ end
32
27
  end
28
+ else
29
+ token_filters << TokenFilter::Stemmer if enable_stemmer
30
+ end
31
+ words = tokenizer.call(str)
32
+ token_filters.each do |token_filter|
33
+ words = token_filter.call(words)
33
34
  end
34
- return d
35
- end
36
-
37
- def word_hash_for_symbols(words)
38
35
  d = Hash.new(0)
39
36
  words.each do |word|
40
37
  d[word.intern] += 1
41
38
  end
42
- return d
43
- end
44
-
45
- # Create a lazily-loaded hash of stopword data
46
- STOPWORDS = Hash.new do |hash, language|
47
- hash[language] = []
48
-
49
- STOPWORDS_PATH.each do |path|
50
- if File.exist?(File.join(path, language))
51
- hash[language] = Set.new File.read(File.join(path, language.to_s)).split
52
- break
53
- end
54
- end
55
-
56
- hash[language]
39
+ d
57
40
  end
58
41
  end
59
42
  end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
4
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
5
+ # License:: LGPL
6
+
7
+ module ClassifierReborn
8
+ module TokenFilter
9
+ # This filter converts given tokens to their stemmed versions.
10
+ module Stemmer
11
+ module_function
12
+
13
+ def call(tokens)
14
+ tokens.collect do |token|
15
+ if token.stemmable?
16
+ token.stem
17
+ else
18
+ token
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end