classifier-reborn 2.1.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,125 @@
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+ 得 打
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ 使
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+ 沿
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
@@ -25,6 +25,15 @@
25
25
  # License:: LGPL
26
26
 
27
27
  require 'rubygems'
28
+
29
+ case RUBY_PLATFORM
30
+ when 'java'
31
+ require 'jruby-stemmer'
32
+ else
33
+ require 'fast-stemmer'
34
+ end
35
+
28
36
  require_relative 'classifier-reborn/category_namer'
29
37
  require_relative 'classifier-reborn/bayes'
30
38
  require_relative 'classifier-reborn/lsi'
39
+ require_relative 'classifier-reborn/validators/classifier_validator'
@@ -0,0 +1,75 @@
1
module ClassifierReborn
  # In-memory storage backend for the Bayes classifier's data structures.
  #
  # All state lives in plain Ruby hashes owned by this object:
  # * @total_words / @total_trainings - global counters
  # * @category_counts - per-category { training:, word: } counters
  # * @categories      - per-category word => frequency hashes
  class BayesMemoryBackend
    attr_reader :total_words, :total_trainings

    # Starts with zeroed counters and no categories.
    def initialize
      @total_words = 0
      @total_trainings = 0
      @category_counts = {}
      @categories = {}
    end

    # Adds +diff+ (may be negative) to the global word counter.
    def update_total_words(diff)
      @total_words += diff
    end

    # Adds +diff+ (may be negative) to the global training counter.
    def update_total_trainings(diff)
      @total_trainings += diff
    end

    # Number of trainings recorded for +category+ (0 when none were recorded).
    def category_training_count(category)
      category_counts(category)[:training]
    end

    # Adds +diff+ (may be negative) to the training counter of +category+.
    def update_category_training_count(category, diff)
      category_counts(category)[:training] += diff
    end

    # True when +category+ has a positive training count. The key? check keeps
    # this query from creating a counter entry for an unknown category.
    def category_has_trainings?(category)
      @category_counts.key?(category) && category_training_count(category) > 0
    end

    # Number of words recorded for +category+ (0 when none were recorded).
    def category_word_count(category)
      category_counts(category)[:word]
    end

    # Adds +diff+ (may be negative) to the word counter of +category+.
    def update_category_word_count(category, diff)
      category_counts(category)[:word] += diff
    end

    # Registers +category+; an already registered category keeps its words.
    def add_category(category)
      @categories[category] ||= Hash.new(0)
    end

    # Keys of all registered categories.
    def category_keys
      @categories.keys
    end

    # Frequency of +word+ in +category+. Returns 0 for an unknown word and,
    # like the Redis backend, also for a category that was never added
    # (previously this raised NoMethodError on nil).
    def category_word_frequency(category, word)
      words = @categories[category]
      words ? words[word] : 0
    end

    # Adds +diff+ (may be negative) to the frequency of +word+ in +category+.
    # The category must have been registered with #add_category first.
    def update_category_word_frequency(category, word, diff)
      @categories[category][word] += diff
    end

    # Forgets +word+ in +category+; a no-op (nil) for an unknown category.
    def delete_category_word(category, word)
      words = @categories[category]
      words.delete(word) if words
    end

    # True when +word+ has an entry in +category+; false for an unknown category.
    def word_in_category?(category, word)
      words = @categories[category]
      words ? words.key?(word) : false
    end

    # Drops every counter and category by re-running the initializer.
    def reset
      initialize
    end

    private

    # Counter hash for +category+, created with zeroed counters on first use.
    def category_counts(category)
      @category_counts[category] ||= {training: 0, word: 0}
    end
  end
end
@@ -0,0 +1,107 @@
1
+ require_relative 'no_redis_error'
2
+ # require redis when we run #initialize. This way only people using this backend
3
+ # will need to install and load the backend without having to
4
+ # require 'classifier-reborn/backends/bayes_redis_backend'
5
+
6
module ClassifierReborn
  # Storage backend that keeps the Bayes classifier's data structures in Redis.
  class BayesRedisBackend
    # Opens a Redis connection and makes sure both global counters exist.
    #
    # +options+ is handed to Redis.new unchanged, so the constructor takes the
    # same arguments as the redis gem. E.g.,
    #      b = ClassifierReborn::BayesRedisBackend.new
    #      b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
    #      b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
    #
    # Options available are:
    #   url:                lambda { ENV["REDIS_URL"] }
    #   scheme:             "redis"
    #   host:               "127.0.0.1"
    #   port:               6379
    #   path:               nil
    #   timeout:            5.0
    #   password:           nil
    #   db:                 0
    #   driver:             nil
    #   id:                 nil
    #   tcp_keepalive:      0
    #   reconnect_attempts: 1
    #   inherit_socket:     false
    #
    # Raises NoRedisError when the redis gem cannot be loaded.
    def initialize(options = {})
      # Loaded here rather than at file level so that only users of this
      # backend need the redis gem installed.
      begin
        require 'redis'
      rescue LoadError
        raise NoRedisError
      end

      @redis = Redis.new(options)
      [:total_words, :total_trainings].each { |counter| @redis.setnx(counter, 0) }
    end

    # Global word counter as an Integer.
    def total_words
      stored_integer(:total_words)
    end

    # Adds +diff+ to the global word counter.
    def update_total_words(diff)
      @redis.incrby(:total_words, diff)
    end

    # Global training counter as an Integer.
    def total_trainings
      stored_integer(:total_trainings)
    end

    # Adds +diff+ to the global training counter.
    def update_total_trainings(diff)
      @redis.incrby(:total_trainings, diff)
    end

    # Training counter of +category+ as an Integer.
    def category_training_count(category)
      stored_hash_integer(:category_training_count, category)
    end

    # Adds +diff+ to the training counter of +category+.
    def update_category_training_count(category, diff)
      @redis.hincrby(:category_training_count, category, diff)
    end

    # True when the training counter of +category+ is positive.
    def category_has_trainings?(category)
      0 < category_training_count(category)
    end

    # Word counter of +category+ as an Integer.
    def category_word_count(category)
      stored_hash_integer(:category_word_count, category)
    end

    # Adds +diff+ to the word counter of +category+.
    def update_category_word_count(category, diff)
      @redis.hincrby(:category_word_count, category, diff)
    end

    # Records +category+ in the set of known category keys.
    def add_category(category)
      @redis.sadd(:category_keys, category)
    end

    # Known category keys, converted to symbols.
    def category_keys
      @redis.smembers(:category_keys).map { |name| name.intern }
    end

    # Frequency of +word+ in the hash stored under the +category+ key.
    def category_word_frequency(category, word)
      stored_hash_integer(category, word)
    end

    # Adds +diff+ to the frequency of +word+ in +category+.
    def update_category_word_frequency(category, word, diff)
      @redis.hincrby(category, word, diff)
    end

    # Removes +word+ from the hash stored under the +category+ key.
    def delete_category_word(category, word)
      @redis.hdel(category, word)
    end

    # Whether the hash stored under the +category+ key has a +word+ field.
    def word_in_category?(category, word)
      @redis.hexists(category, word)
    end

    # Flushes the selected Redis database (FLUSHDB) and re-creates both
    # global counters with a value of 0.
    def reset
      @redis.flushdb
      @redis.set(:total_words, 0)
      @redis.set(:total_trainings, 0)
    end

    private

    # Value stored under +key+, coerced with to_i.
    def stored_integer(key)
      @redis.get(key).to_i
    end

    # Value of +field+ in the hash stored under +key+, coerced with to_i.
    def stored_hash_integer(key, field)
      @redis.hget(key, field).to_i
    end
  end
end
@@ -0,0 +1,12 @@
1
# Raised by the Redis backend when the redis gem cannot be loaded.
#
# Without arguments the error carries installation instructions. A custom
# message may be supplied, which also makes `raise NoRedisError, "msg"` work
# (the former zero-argument initializer turned that into an ArgumentError).
class NoRedisError < LoadError
  # Message used when the caller does not provide one.
  DEFAULT_MESSAGE = [
    "The Redis Backend can only be used if Redis is installed.",
    "This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.",
    "If you have encountered this error and would like to use the Redis Backend,",
    "please run 'gem install redis' or include 'gem \"redis\"' in",
    "your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.",
    ""
  ].join("\n").freeze

  # msg:: text of the error; defaults to the installation instructions.
  def initialize(msg = DEFAULT_MESSAGE)
    super(msg)
  end
end
@@ -2,7 +2,11 @@
2
2
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
3
  # License:: LGPL
4
4
 
5
+ require 'set'
6
+
5
7
  require_relative 'category_namer'
8
+ require_relative 'backends/bayes_memory_backend'
9
+ require_relative 'backends/bayes_redis_backend'
6
10
 
7
11
  module ClassifierReborn
8
12
  class Bayes
@@ -13,36 +17,45 @@ module ClassifierReborn
13
17
  # b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
14
18
  #
15
19
  # Options available are:
16
- # language: 'en' Used to select language specific stop words
17
- # auto_categorize: false When true, enables ability to dynamically declare a category
18
- # enable_threshold: false When true, enables a threshold requirement for classifition
19
- # threshold: 0.0 Default threshold, only used when enabled
20
- # enable_stemmer: true When false, disables word stemming
20
+ # language: 'en' Used to select language specific stop words
21
+ # auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
22
+ # enable_threshold: false When true, enables a threshold requirement for classification
23
+ # threshold: 0.0 Default threshold, only used when enabled
24
+ # enable_stemmer: true When false, disables word stemming
25
+ # stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
26
+ # backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
21
27
  def initialize(*args)
22
- @categories = {}
28
+ @initial_categories = []
23
29
  options = { language: 'en',
24
- auto_categorize: false,
25
30
  enable_threshold: false,
26
31
  threshold: 0.0,
27
- enable_stemmer: true
32
+ enable_stemmer: true,
33
+ backend: BayesMemoryBackend.new
28
34
  }
29
35
  args.flatten.each do |arg|
30
36
  if arg.is_a?(Hash)
31
37
  options.merge!(arg)
32
38
  else
33
- add_category(arg)
39
+ @initial_categories.push(arg)
34
40
  end
35
41
  end
36
42
 
37
- @total_words = 0
38
- @category_counts = Hash.new(0)
39
- @category_word_count = Hash.new(0)
43
+ unless options.key?(:auto_categorize)
44
+ options[:auto_categorize] = @initial_categories.empty? ? true : false
45
+ end
40
46
 
41
47
  @language = options[:language]
42
48
  @auto_categorize = options[:auto_categorize]
43
49
  @enable_threshold = options[:enable_threshold]
44
50
  @threshold = options[:threshold]
45
51
  @enable_stemmer = options[:enable_stemmer]
52
+ @backend = options[:backend]
53
+
54
+ populate_initial_categories
55
+
56
+ if options.key?(:stopwords)
57
+ custom_stopwords options[:stopwords]
58
+ end
46
59
  end
47
60
 
48
61
  # Provides a general training method for all categories specified in Bayes#new
@@ -52,10 +65,12 @@ module ClassifierReborn
52
65
  # b.train "that", "That text"
53
66
  # b.train "The other", "The other text"
54
67
  def train(category, text)
68
+ word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
69
+ return if word_hash.empty?
55
70
  category = CategoryNamer.prepare_name(category)
56
71
 
57
72
  # Add the category dynamically or raise an error
58
- unless @categories.key?(category)
73
+ unless category_keys.include?(category)
59
74
  if @auto_categorize
60
75
  add_category(category)
61
76
  else
@@ -63,12 +78,13 @@ module ClassifierReborn
63
78
  end
64
79
  end
65
80
 
66
- @category_counts[category] += 1
67
- Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
68
- @categories[category][word] += count
69
- @category_word_count[category] += count
70
- @total_words += count
81
+ word_hash.each do |word, count|
82
+ @backend.update_category_word_frequency(category, word, count)
83
+ @backend.update_category_word_count(category, count)
84
+ @backend.update_total_words(count)
71
85
  end
86
+ @backend.update_total_trainings(1)
87
+ @backend.update_category_training_count(category, 1)
72
88
  end
73
89
 
74
90
  # Provides an untraining method for all categories specified in Bayes#new
@@ -79,20 +95,23 @@ module ClassifierReborn
79
95
  # b.train :this, "This text"
80
96
  # b.untrain :this, "This text"
81
97
  def untrain(category, text)
98
+ word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
99
+ return if word_hash.empty?
82
100
  category = CategoryNamer.prepare_name(category)
83
- @category_counts[category] -= 1
84
- Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
85
- next if @total_words < 0
86
- orig = @categories[category][word] || 0
87
- @categories[category][word] -= count
88
- if @categories[category][word] <= 0
89
- @categories[category].delete(word)
101
+ word_hash.each do |word, count|
102
+ next if @backend.total_words < 0
103
+ orig = @backend.category_word_frequency(category, word) || 0
104
+ @backend.update_category_word_frequency(category, word, -count)
105
+ if @backend.category_word_frequency(category, word) <= 0
106
+ @backend.delete_category_word(category, word)
90
107
  count = orig
91
108
  end
92
109
 
93
- @category_word_count[category] -= count if @category_word_count[category] >= count
94
- @total_words -= count
110
+ @backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
111
+ @backend.update_total_words(-count)
95
112
  end
113
+ @backend.update_total_trainings(-1)
114
+ @backend.update_category_training_count(category, -1)
96
115
  end
97
116
 
98
117
  # Returns the scores in each category for the provided +text+. E.g.,
@@ -102,17 +121,22 @@ module ClassifierReborn
102
121
  def classifications(text)
103
122
  score = {}
104
123
  word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
105
- training_count = @category_counts.values.reduce(:+).to_f
106
- @categories.each do |category, category_words|
124
+ if word_hash.empty?
125
+ category_keys.each do |category|
126
+ score[category.to_s] = Float::INFINITY
127
+ end
128
+ return score
129
+ end
130
+ category_keys.each do |category|
107
131
  score[category.to_s] = 0
108
- total = (@category_word_count[category] || 1).to_f
132
+ total = (@backend.category_word_count(category) || 1).to_f
109
133
  word_hash.each do |word, _count|
110
- s = category_words.key?(word) ? category_words[word] : 0.1
134
+ s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
111
135
  score[category.to_s] += Math.log(s / total)
112
136
  end
113
137
  # now add prior probability for the category
114
- s = @category_counts.key?(category) ? @category_counts[category] : 0.1
115
- score[category.to_s] += Math.log(s / training_count)
138
+ s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
139
+ score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
116
140
  end
117
141
  score
118
142
  end
@@ -178,7 +202,7 @@ module ClassifierReborn
178
202
  def method_missing(name, *args)
179
203
  cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
180
204
  category = CategoryNamer.prepare_name(cleaned_name)
181
- if @categories.key? category
205
+ if category_keys.include?(category)
182
206
  args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
183
207
  elsif name.to_s =~ /(un)?train_([\w]+)/
184
208
  raise StandardError, "No such category: #{category}"
@@ -190,9 +214,17 @@ module ClassifierReborn
190
214
  # Provides a list of category names
191
215
  # For example:
192
216
  # b.categories
193
- # => ['This', 'That', 'the_other']
194
- def categories # :nodoc:
195
- @categories.keys.collect(&:to_s)
217
+ # => ["This", "That", "The other"]
218
# Lists every category known to the classifier, each key rendered with to_s.
def categories
  category_keys.map { |key| key.to_s }
end
221
+
222
+ # Provides a list of category keys as symbols
223
+ # For example:
224
+ # b.category_keys
225
+ # => [:This, :That, :"The other"]
226
# Category keys exactly as the configured storage backend reports them.
def category_keys
  @backend.category_keys
end
197
229
 
198
230
  # Allows you to add categories to the classifier.
@@ -204,9 +236,37 @@ module ClassifierReborn
204
236
  # more criteria than the trained selective categories. In short,
205
237
  # try to initialize your categories at initialization.
206
238
  def add_category(category)
207
- @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
239
+ category = CategoryNamer.prepare_name(category)
240
+ @backend.add_category(category)
208
241
  end
209
242
 
210
243
  alias_method :append_category, :add_category
244
+
245
# Resets the storage backend, then registers the categories that were
# passed to the constructor again.
def reset
  @backend.reset
  populate_initial_categories
end
249
+
250
+ private
251
+
252
# Registers, via add_category, every category collected in @initial_categories.
def populate_initial_categories
  @initial_categories.each { |name| add_category(name) }
end
257
+
258
+ # Overwrites the default stopwords for current language with supplied list of stopwords or file
259
# Replaces the stopword set of the current language (Hasher::STOPWORDS[@language]).
#
# stopwords:: one of
#             * an Enumerable of words - used as the new stopword list
#             * a blank string         - disables stopwords (empty set)
#             * a path to a text file  - the file's whitespace-separated words are used
#             * nil, or a path that is not a regular file - the defaults stay untouched
#
# Returns the newly installed Set, or nil when the defaults were kept.
def custom_stopwords(stopwords)
  # An explicit `stopwords: nil` means "no custom list"; previously it
  # crashed with NoMethodError on nil.strip.
  return if stopwords.nil?

  unless stopwords.is_a?(Enumerable)
    source = stopwords.to_s
    if source.strip.empty?
      stopwords = []
    elsif File.file?(source)
      # File.file? (rather than File.exist?) keeps a directory path from
      # reaching File.read, which would raise.
      stopwords = File.read(source).force_encoding("utf-8").split
    else
      return # Do not overwrite the default
    end
  end
  Hasher::STOPWORDS[@language] = Set.new stopwords
end
211
271
  end
212
272
  end