classifier-reborn 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+ 得 打
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ 使
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+ 沿
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
@@ -25,6 +25,15 @@
25
25
  # License:: LGPL
26
26
 
27
27
  require 'rubygems'
28
+
29
+ case RUBY_PLATFORM
30
+ when 'java'
31
+ require 'jruby-stemmer'
32
+ else
33
+ require 'fast-stemmer'
34
+ end
35
+
28
36
  require_relative 'classifier-reborn/category_namer'
29
37
  require_relative 'classifier-reborn/bayes'
30
38
  require_relative 'classifier-reborn/lsi'
39
+ require_relative 'classifier-reborn/validators/classifier_validator'
@@ -0,0 +1,75 @@
1
module ClassifierReborn
  # In-process storage backend for the Bayes classifier's data structures.
  #
  # State is held in plain Ruby objects:
  # * +@total_words+ / +@total_trainings+ - global Integer counters
  # * +@category_counts+ - category => <tt>{training: Integer, word: Integer}</tt>,
  #   created lazily the first time a category's counters are touched
  # * +@categories+ - category => (word => frequency) hashes whose missing
  #   words read as 0
  #
  # Word lookups and deletions for a category that was never registered via
  # #add_category answer with 0 / false / nil instead of raising, which matches
  # what BayesRedisBackend does for the same calls.
  class BayesMemoryBackend
    attr_reader :total_words, :total_trainings

    # Starts with zeroed totals and no categories.
    def initialize
      @total_words = 0
      @total_trainings = 0
      @category_counts = {}
      @categories = {}
    end

    # Adds +diff+ (may be negative) to the global word total.
    def update_total_words(diff)
      @total_words += diff
    end

    # Adds +diff+ (may be negative) to the global training total.
    def update_total_trainings(diff)
      @total_trainings += diff
    end

    # Returns how many trainings were recorded for +category+ (0 if none).
    def category_training_count(category)
      category_counts(category)[:training]
    end

    # Adds +diff+ to the training counter of +category+.
    def update_category_training_count(category, diff)
      category_counts(category)[:training] += diff
    end

    # True when +category+ has a positive training counter. The key? check
    # comes first so that asking about an unseen category does not create
    # a counter entry for it.
    def category_has_trainings?(category)
      @category_counts.key?(category) && category_training_count(category) > 0
    end

    # Returns the total number of words counted for +category+ (0 if none).
    def category_word_count(category)
      category_counts(category)[:word]
    end

    # Adds +diff+ to the word counter of +category+.
    def update_category_word_count(category, diff)
      category_counts(category)[:word] += diff
    end

    # Registers +category+ with an empty word table; an already registered
    # category keeps its existing table. Returns the table.
    def add_category(category)
      @categories[category] ||= Hash.new(0)
    end

    # Returns the registered categories in insertion order.
    def category_keys
      @categories.keys
    end

    # Returns the stored frequency of +word+ in +category+; 0 when either the
    # word or the category is unknown.
    def category_word_frequency(category, word)
      words = @categories[category]
      words ? words[word] : 0
    end

    # Adds +diff+ to the frequency of +word+ in +category+.
    # The category must have been registered with #add_category first;
    # otherwise there is no word table to update and NoMethodError is raised.
    def update_category_word_frequency(category, word, diff)
      @categories[category][word] += diff
    end

    # Removes +word+ from +category+ and returns its former frequency,
    # or nil when the word or the category is unknown.
    def delete_category_word(category, word)
      words = @categories[category]
      words.delete(word) if words
    end

    # True only when +category+ is registered and has an entry for +word+.
    def word_in_category?(category, word)
      words = @categories[category]
      !words.nil? && words.key?(word)
    end

    # Drops every counter and category by re-running the initializer.
    def reset
      initialize
    end

    private

    # Returns the counter hash for +category+, creating a zeroed one on first use.
    def category_counts(category)
      @category_counts[category] ||= { training: 0, word: 0 }
    end
  end
end
@@ -0,0 +1,107 @@
1
+ require_relative 'no_redis_error'
2
+ # require redis when we run #initialize. This way only people using this backend
3
+ # will need to install and load the backend without having to
4
+ # require 'classifier-reborn/backends/bayes_redis_backend'
5
+
6
module ClassifierReborn
  # Storage backend that keeps the Bayes classifier's data structures in Redis.
  #
  # Keys used on the server:
  # * +total_words+, +total_trainings+ - string counters
  # * +category_training_count+, +category_word_count+ - hashes keyed by category
  # * +category_keys+ - set of category names
  # * one hash per category, stored under the category name itself,
  #   mapping word => frequency
  class BayesRedisBackend
    # Opens the Redis connection and seeds both global counters with 0 unless
    # they already exist (SETNX), so previously stored data is kept.
    #
    # +options+ is handed to Redis.new untouched, e.g.
    #   b = ClassifierReborn::BayesRedisBackend.new
    #   b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
    #   b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
    #
    # Options available are:
    #   url:                lambda { ENV["REDIS_URL"] }
    #   scheme:             "redis"
    #   host:               "127.0.0.1"
    #   port:               6379
    #   path:               nil
    #   timeout:            5.0
    #   password:           nil
    #   db:                 0
    #   driver:             nil
    #   id:                 nil
    #   tcp_keepalive:      0
    #   reconnect_attempts: 1
    #   inherit_socket:     false
    #
    # Raises NoRedisError when the 'redis' gem cannot be loaded.
    def initialize(options = {})
      load_redis_gem

      @redis = Redis.new(options)
      [:total_words, :total_trainings].each { |counter| @redis.setnx(counter, 0) }
    end

    # Global number of words seen across all categories.
    def total_words
      read_counter(:total_words)
    end

    # Adds +diff+ to the global word counter.
    def update_total_words(diff)
      @redis.incrby(:total_words, diff)
    end

    # Global number of trainings across all categories.
    def total_trainings
      read_counter(:total_trainings)
    end

    # Adds +diff+ to the global training counter.
    def update_total_trainings(diff)
      @redis.incrby(:total_trainings, diff)
    end

    # Number of trainings recorded for +category+ (0 when absent).
    def category_training_count(category)
      read_hash_counter(:category_training_count, category)
    end

    # Adds +diff+ to the training counter of +category+.
    def update_category_training_count(category, diff)
      @redis.hincrby(:category_training_count, category, diff)
    end

    # True when +category+ has a positive training counter.
    def category_has_trainings?(category)
      trainings = category_training_count(category)
      trainings > 0
    end

    # Number of words counted for +category+ (0 when absent).
    def category_word_count(category)
      read_hash_counter(:category_word_count, category)
    end

    # Adds +diff+ to the word counter of +category+.
    def update_category_word_count(category, diff)
      @redis.hincrby(:category_word_count, category, diff)
    end

    # Records +category+ in the set of known categories.
    def add_category(category)
      @redis.sadd(:category_keys, category)
    end

    # Known categories, converted from the stored strings to symbols.
    def category_keys
      stored_names = @redis.smembers(:category_keys)
      stored_names.map { |stored_name| stored_name.intern }
    end

    # Frequency of +word+ inside the hash stored under +category+ (0 when absent).
    def category_word_frequency(category, word)
      read_hash_counter(category, word)
    end

    # Adds +diff+ to the frequency of +word+ inside +category+.
    def update_category_word_frequency(category, word, diff)
      @redis.hincrby(category, word, diff)
    end

    # Removes +word+ from the hash stored under +category+.
    def delete_category_word(category, word)
      @redis.hdel(category, word)
    end

    # Asks Redis whether the hash stored under +category+ has a field +word+.
    def word_in_category?(category, word)
      @redis.hexists(category, word)
    end

    # Wipes the stored data and writes both global counters back as 0.
    # NOTE: this issues FLUSHDB, so every key in the selected Redis database
    # is removed, not only the keys written by this class.
    def reset
      @redis.flushdb
      @redis.set(:total_words, 0)
      @redis.set(:total_trainings, 0)
    end

    private

    # Loads the 'redis' gem on demand so that only users of this backend need
    # it installed; a missing gem surfaces as NoRedisError.
    def load_redis_gem
      require 'redis'
    rescue LoadError
      raise NoRedisError
    end

    # Integer value of the string stored at +key+ (nil reads as 0).
    def read_counter(key)
      @redis.get(key).to_i
    end

    # Integer value of +field+ in the hash stored at +key+ (nil reads as 0).
    def read_hash_counter(key, field)
      @redis.hget(key, field).to_i
    end
  end
end
@@ -0,0 +1,12 @@
1
# Raised when the Redis backend is used but the 'redis' gem cannot be loaded.
# Subclasses LoadError and always carries the same fixed installation hint.
class NoRedisError < LoadError
  def initialize
    # Single-quoted heredoc: no interpolation, text is passed through verbatim.
    super(<<-'MESSAGE')
The Redis Backend can only be used if Redis is installed.
This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
If you have encountered this error and would like to use the Redis Backend,
please run 'gem install redis' or include 'gem "redis"' in
your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
    MESSAGE
  end
end
@@ -2,7 +2,11 @@
2
2
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
3
  # License:: LGPL
4
4
 
5
+ require 'set'
6
+
5
7
  require_relative 'category_namer'
8
+ require_relative 'backends/bayes_memory_backend'
9
+ require_relative 'backends/bayes_redis_backend'
6
10
 
7
11
  module ClassifierReborn
8
12
  class Bayes
@@ -13,36 +17,45 @@ module ClassifierReborn
13
17
  # b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
14
18
  #
15
19
  # Options available are:
16
- # language: 'en' Used to select language specific stop words
17
- # auto_categorize: false When true, enables ability to dynamically declare a category
18
- # enable_threshold: false When true, enables a threshold requirement for classifition
19
- # threshold: 0.0 Default threshold, only used when enabled
20
- # enable_stemmer: true When false, disables word stemming
20
+ # language: 'en' Used to select language specific stop words
21
+ # auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
22
+ # enable_threshold: false When true, enables a threshold requirement for classification
23
+ # threshold: 0.0 Default threshold, only used when enabled
24
+ # enable_stemmer: true When false, disables word stemming
25
+ # stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
26
+ # backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
21
27
  def initialize(*args)
22
- @categories = {}
28
+ @initial_categories = []
23
29
  options = { language: 'en',
24
- auto_categorize: false,
25
30
  enable_threshold: false,
26
31
  threshold: 0.0,
27
- enable_stemmer: true
32
+ enable_stemmer: true,
33
+ backend: BayesMemoryBackend.new
28
34
  }
29
35
  args.flatten.each do |arg|
30
36
  if arg.is_a?(Hash)
31
37
  options.merge!(arg)
32
38
  else
33
- add_category(arg)
39
+ @initial_categories.push(arg)
34
40
  end
35
41
  end
36
42
 
37
- @total_words = 0
38
- @category_counts = Hash.new(0)
39
- @category_word_count = Hash.new(0)
43
+ unless options.key?(:auto_categorize)
44
+ options[:auto_categorize] = @initial_categories.empty? ? true : false
45
+ end
40
46
 
41
47
  @language = options[:language]
42
48
  @auto_categorize = options[:auto_categorize]
43
49
  @enable_threshold = options[:enable_threshold]
44
50
  @threshold = options[:threshold]
45
51
  @enable_stemmer = options[:enable_stemmer]
52
+ @backend = options[:backend]
53
+
54
+ populate_initial_categories
55
+
56
+ if options.key?(:stopwords)
57
+ custom_stopwords options[:stopwords]
58
+ end
46
59
  end
47
60
 
48
61
  # Provides a general training method for all categories specified in Bayes#new
@@ -52,10 +65,12 @@ module ClassifierReborn
52
65
  # b.train "that", "That text"
53
66
  # b.train "The other", "The other text"
54
67
  def train(category, text)
68
+ word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
69
+ return if word_hash.empty?
55
70
  category = CategoryNamer.prepare_name(category)
56
71
 
57
72
  # Add the category dynamically or raise an error
58
- unless @categories.key?(category)
73
+ unless category_keys.include?(category)
59
74
  if @auto_categorize
60
75
  add_category(category)
61
76
  else
@@ -63,12 +78,13 @@ module ClassifierReborn
63
78
  end
64
79
  end
65
80
 
66
- @category_counts[category] += 1
67
- Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
68
- @categories[category][word] += count
69
- @category_word_count[category] += count
70
- @total_words += count
81
+ word_hash.each do |word, count|
82
+ @backend.update_category_word_frequency(category, word, count)
83
+ @backend.update_category_word_count(category, count)
84
+ @backend.update_total_words(count)
71
85
  end
86
+ @backend.update_total_trainings(1)
87
+ @backend.update_category_training_count(category, 1)
72
88
  end
73
89
 
74
90
  # Provides an untraining method for all categories specified in Bayes#new
@@ -79,20 +95,23 @@ module ClassifierReborn
79
95
  # b.train :this, "This text"
80
96
  # b.untrain :this, "This text"
81
97
  def untrain(category, text)
98
+ word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
99
+ return if word_hash.empty?
82
100
  category = CategoryNamer.prepare_name(category)
83
- @category_counts[category] -= 1
84
- Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
85
- next if @total_words < 0
86
- orig = @categories[category][word] || 0
87
- @categories[category][word] -= count
88
- if @categories[category][word] <= 0
89
- @categories[category].delete(word)
101
+ word_hash.each do |word, count|
102
+ next if @backend.total_words < 0
103
+ orig = @backend.category_word_frequency(category, word) || 0
104
+ @backend.update_category_word_frequency(category, word, -count)
105
+ if @backend.category_word_frequency(category, word) <= 0
106
+ @backend.delete_category_word(category, word)
90
107
  count = orig
91
108
  end
92
109
 
93
- @category_word_count[category] -= count if @category_word_count[category] >= count
94
- @total_words -= count
110
+ @backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
111
+ @backend.update_total_words(-count)
95
112
  end
113
+ @backend.update_total_trainings(-1)
114
+ @backend.update_category_training_count(category, -1)
96
115
  end
97
116
 
98
117
  # Returns the scores in each category for the provided +text+. E.g.,
@@ -102,17 +121,22 @@ module ClassifierReborn
102
121
  def classifications(text)
103
122
  score = {}
104
123
  word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
105
- training_count = @category_counts.values.reduce(:+).to_f
106
- @categories.each do |category, category_words|
124
+ if word_hash.empty?
125
+ category_keys.each do |category|
126
+ score[category.to_s] = Float::INFINITY
127
+ end
128
+ return score
129
+ end
130
+ category_keys.each do |category|
107
131
  score[category.to_s] = 0
108
- total = (@category_word_count[category] || 1).to_f
132
+ total = (@backend.category_word_count(category) || 1).to_f
109
133
  word_hash.each do |word, _count|
110
- s = category_words.key?(word) ? category_words[word] : 0.1
134
+ s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
111
135
  score[category.to_s] += Math.log(s / total)
112
136
  end
113
137
  # now add prior probability for the category
114
- s = @category_counts.key?(category) ? @category_counts[category] : 0.1
115
- score[category.to_s] += Math.log(s / training_count)
138
+ s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
139
+ score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
116
140
  end
117
141
  score
118
142
  end
@@ -178,7 +202,7 @@ module ClassifierReborn
178
202
  def method_missing(name, *args)
179
203
  cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
180
204
  category = CategoryNamer.prepare_name(cleaned_name)
181
- if @categories.key? category
205
+ if category_keys.include?(category)
182
206
  args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
183
207
  elsif name.to_s =~ /(un)?train_([\w]+)/
184
208
  raise StandardError, "No such category: #{category}"
@@ -190,9 +214,17 @@ module ClassifierReborn
190
214
  # Provides a list of category names
191
215
  # For example:
192
216
  # b.categories
193
- # => ['This', 'That', 'the_other']
194
- def categories # :nodoc:
195
- @categories.keys.collect(&:to_s)
217
+ # => ["This", "That", "The other"]
218
+ def categories
219
+ category_keys.collect(&:to_s)
220
+ end
221
+
222
+ # Provides a list of category keys as symbols
223
+ # For example:
224
+ # b.categories
225
+ # => [:This, :That, :"The other"]
226
+ def category_keys
227
+ @backend.category_keys
196
228
  end
197
229
 
198
230
  # Allows you to add categories to the classifier.
@@ -204,9 +236,37 @@ module ClassifierReborn
204
236
  # more criteria than the trained selective categories. In short,
205
237
  # try to initialize your categories at initialization.
206
238
  def add_category(category)
207
- @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
239
+ category = CategoryNamer.prepare_name(category)
240
+ @backend.add_category(category)
208
241
  end
209
242
 
210
243
  alias_method :append_category, :add_category
244
+
245
# Resets the backend, then registers again every category that was passed
# to the constructor. Returns the list of those initial categories.
def reset
  @backend.reset
  @initial_categories.each { |name| add_category(name) }
end
249
+
250
+ private
251
+
252
# Registers, in order, each category collected in @initial_categories
# by calling add_category on it.
def populate_initial_categories
  @initial_categories.each { |name| add_category(name) }
end
257
+
258
# Overwrites the default stopwords of the current language (@language).
#
# +stopwords+ may be:
# * an Enumerable of words - used as the new stopword list
# * a blank String         - installs an empty list, i.e. disables stopwords
# * a String path to an existing file - the file is read as UTF-8 and split
#   on whitespace to form the list
# * nil, or a String that is neither blank nor an existing file - the current
#   stopwords are left untouched
#
# The list is written to Hasher::STOPWORDS[@language], a constant, so the
# change is not limited to this classifier instance.
def custom_stopwords(stopwords)
  # An explicit `stopwords: nil` means "keep the defaults"; without this
  # guard the String branch below would call nil.strip and raise.
  return if stopwords.nil?

  unless stopwords.is_a?(Enumerable)
    if stopwords.strip.empty?
      stopwords = []
    elsif File.exist?(stopwords)
      stopwords = File.read(stopwords).force_encoding("utf-8").split
    else
      return # Do not overwrite the default
    end
  end
  Hasher::STOPWORDS[@language] = Set.new stopwords
end
211
271
  end
212
272
  end