classifier-reborn 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +74 -1
- data/README.markdown +57 -227
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn.rb +9 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +75 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +107 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +12 -0
- data/lib/classifier-reborn/bayes.rb +98 -38
- data/lib/classifier-reborn/category_namer.rb +0 -1
- data/lib/classifier-reborn/extensions/hasher.rb +1 -1
- data/lib/classifier-reborn/lsi.rb +5 -1
- data/lib/classifier-reborn/lsi/word_list.rb +2 -4
- data/lib/classifier-reborn/validators/classifier_validator.rb +169 -0
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +30 -8
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
data/data/stopwords/zh
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
|
2
|
+
的
|
3
|
+
一
|
4
|
+
不
|
5
|
+
在
|
6
|
+
人
|
7
|
+
有
|
8
|
+
是
|
9
|
+
为
|
10
|
+
以
|
11
|
+
于
|
12
|
+
上
|
13
|
+
他
|
14
|
+
而
|
15
|
+
后
|
16
|
+
之
|
17
|
+
来
|
18
|
+
及
|
19
|
+
了
|
20
|
+
因
|
21
|
+
下
|
22
|
+
可
|
23
|
+
到
|
24
|
+
由
|
25
|
+
这
|
26
|
+
与
|
27
|
+
也
|
28
|
+
此
|
29
|
+
但
|
30
|
+
并
|
31
|
+
个
|
32
|
+
其
|
33
|
+
已
|
34
|
+
无
|
35
|
+
小
|
36
|
+
我
|
37
|
+
们
|
38
|
+
起
|
39
|
+
最
|
40
|
+
再
|
41
|
+
今
|
42
|
+
去
|
43
|
+
好
|
44
|
+
只
|
45
|
+
又
|
46
|
+
或
|
47
|
+
很
|
48
|
+
亦
|
49
|
+
某
|
50
|
+
把
|
51
|
+
那
|
52
|
+
你
|
53
|
+
乃
|
54
|
+
它
|
55
|
+
吧
|
56
|
+
被
|
57
|
+
比
|
58
|
+
别
|
59
|
+
趁
|
60
|
+
当
|
61
|
+
从
|
62
|
+
到
|
63
|
+
得 打
|
64
|
+
凡
|
65
|
+
儿
|
66
|
+
尔
|
67
|
+
该
|
68
|
+
各
|
69
|
+
给
|
70
|
+
跟
|
71
|
+
和
|
72
|
+
何
|
73
|
+
还
|
74
|
+
即
|
75
|
+
几
|
76
|
+
既
|
77
|
+
看
|
78
|
+
据
|
79
|
+
距
|
80
|
+
靠
|
81
|
+
啦
|
82
|
+
了
|
83
|
+
另
|
84
|
+
么
|
85
|
+
每
|
86
|
+
们
|
87
|
+
嘛
|
88
|
+
拿
|
89
|
+
哪
|
90
|
+
那
|
91
|
+
您
|
92
|
+
凭
|
93
|
+
且
|
94
|
+
却
|
95
|
+
让
|
96
|
+
仍
|
97
|
+
啥
|
98
|
+
如
|
99
|
+
若
|
100
|
+
使
|
101
|
+
谁
|
102
|
+
虽
|
103
|
+
随
|
104
|
+
同
|
105
|
+
所
|
106
|
+
她
|
107
|
+
哇
|
108
|
+
嗡
|
109
|
+
往
|
110
|
+
哪
|
111
|
+
些
|
112
|
+
向
|
113
|
+
沿
|
114
|
+
哟
|
115
|
+
用
|
116
|
+
于
|
117
|
+
咱
|
118
|
+
则
|
119
|
+
怎
|
120
|
+
曾
|
121
|
+
至
|
122
|
+
致
|
123
|
+
着
|
124
|
+
诸
|
125
|
+
自
|
data/lib/classifier-reborn.rb
CHANGED
@@ -25,6 +25,15 @@
|
|
25
25
|
# License:: LGPL
|
26
26
|
|
27
27
|
require 'rubygems'
|
28
|
+
|
29
|
+
case RUBY_PLATFORM
|
30
|
+
when 'java'
|
31
|
+
require 'jruby-stemmer'
|
32
|
+
else
|
33
|
+
require 'fast-stemmer'
|
34
|
+
end
|
35
|
+
|
28
36
|
require_relative 'classifier-reborn/category_namer'
|
29
37
|
require_relative 'classifier-reborn/bayes'
|
30
38
|
require_relative 'classifier-reborn/lsi'
|
39
|
+
require_relative 'classifier-reborn/validators/classifier_validator'
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module ClassifierReborn
|
2
|
+
class BayesMemoryBackend
|
3
|
+
attr_reader :total_words, :total_trainings
|
4
|
+
|
5
|
+
# This class provides Memory as the storage backend for the classifier data structures
|
6
|
+
def initialize
|
7
|
+
@total_words = 0
|
8
|
+
@total_trainings = 0
|
9
|
+
@category_counts = {}
|
10
|
+
@categories = {}
|
11
|
+
end
|
12
|
+
|
13
|
+
def update_total_words(diff)
|
14
|
+
@total_words += diff
|
15
|
+
end
|
16
|
+
|
17
|
+
def update_total_trainings(diff)
|
18
|
+
@total_trainings += diff
|
19
|
+
end
|
20
|
+
|
21
|
+
def category_training_count(category)
|
22
|
+
category_counts(category)[:training]
|
23
|
+
end
|
24
|
+
|
25
|
+
def update_category_training_count(category, diff)
|
26
|
+
category_counts(category)[:training] += diff
|
27
|
+
end
|
28
|
+
|
29
|
+
def category_has_trainings?(category)
|
30
|
+
@category_counts.key?(category) && category_training_count(category) > 0
|
31
|
+
end
|
32
|
+
|
33
|
+
def category_word_count(category)
|
34
|
+
category_counts(category)[:word]
|
35
|
+
end
|
36
|
+
|
37
|
+
def update_category_word_count(category, diff)
|
38
|
+
category_counts(category)[:word] += diff
|
39
|
+
end
|
40
|
+
|
41
|
+
def add_category(category)
|
42
|
+
@categories[category] ||= Hash.new(0)
|
43
|
+
end
|
44
|
+
|
45
|
+
def category_keys
|
46
|
+
@categories.keys
|
47
|
+
end
|
48
|
+
|
49
|
+
def category_word_frequency(category, word)
|
50
|
+
@categories[category][word]
|
51
|
+
end
|
52
|
+
|
53
|
+
def update_category_word_frequency(category, word, diff)
|
54
|
+
@categories[category][word] += diff
|
55
|
+
end
|
56
|
+
|
57
|
+
def delete_category_word(category, word)
|
58
|
+
@categories[category].delete(word)
|
59
|
+
end
|
60
|
+
|
61
|
+
def word_in_category?(category, word)
|
62
|
+
@categories[category].key?(word)
|
63
|
+
end
|
64
|
+
|
65
|
+
def reset
|
66
|
+
initialize
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def category_counts(category)
|
72
|
+
@category_counts[category] ||= {training: 0, word: 0}
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require_relative 'no_redis_error'
|
2
|
+
# require redis when we run #initialize. This way only people using this backend
|
3
|
+
# will need to install and load the backend without having to
|
4
|
+
# require 'classifier-reborn/backends/bayes_redis_backend'
|
5
|
+
|
6
|
+
module ClassifierReborn
|
7
|
+
# This class provides Redis as the storage backend for the classifier data structures
|
8
|
+
class BayesRedisBackend
|
9
|
+
# The class can be created with the same arguments that the redis gem accepts
|
10
|
+
# E.g.,
|
11
|
+
# b = ClassifierReborn::BayesRedisBackend.new
|
12
|
+
# b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
|
13
|
+
# b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
|
14
|
+
#
|
15
|
+
# Options available are:
|
16
|
+
# url: lambda { ENV["REDIS_URL"] }
|
17
|
+
# scheme: "redis"
|
18
|
+
# host: "127.0.0.1"
|
19
|
+
# port: 6379
|
20
|
+
# path: nil
|
21
|
+
# timeout: 5.0
|
22
|
+
# password: nil
|
23
|
+
# db: 0
|
24
|
+
# driver: nil
|
25
|
+
# id: nil
|
26
|
+
# tcp_keepalive: 0
|
27
|
+
# reconnect_attempts: 1
|
28
|
+
# inherit_socket: false
|
29
|
+
def initialize(options = {})
|
30
|
+
begin # because some people don't have redis installed
|
31
|
+
require 'redis'
|
32
|
+
rescue LoadError
|
33
|
+
raise NoRedisError
|
34
|
+
end
|
35
|
+
|
36
|
+
@redis = Redis.new(options)
|
37
|
+
@redis.setnx(:total_words, 0)
|
38
|
+
@redis.setnx(:total_trainings, 0)
|
39
|
+
end
|
40
|
+
|
41
|
+
def total_words
|
42
|
+
@redis.get(:total_words).to_i
|
43
|
+
end
|
44
|
+
|
45
|
+
def update_total_words(diff)
|
46
|
+
@redis.incrby(:total_words, diff)
|
47
|
+
end
|
48
|
+
|
49
|
+
def total_trainings
|
50
|
+
@redis.get(:total_trainings).to_i
|
51
|
+
end
|
52
|
+
|
53
|
+
def update_total_trainings(diff)
|
54
|
+
@redis.incrby(:total_trainings, diff)
|
55
|
+
end
|
56
|
+
|
57
|
+
def category_training_count(category)
|
58
|
+
@redis.hget(:category_training_count, category).to_i
|
59
|
+
end
|
60
|
+
|
61
|
+
def update_category_training_count(category, diff)
|
62
|
+
@redis.hincrby(:category_training_count, category, diff)
|
63
|
+
end
|
64
|
+
|
65
|
+
def category_has_trainings?(category)
|
66
|
+
category_training_count(category) > 0
|
67
|
+
end
|
68
|
+
|
69
|
+
def category_word_count(category)
|
70
|
+
@redis.hget(:category_word_count, category).to_i
|
71
|
+
end
|
72
|
+
|
73
|
+
def update_category_word_count(category, diff)
|
74
|
+
@redis.hincrby(:category_word_count, category, diff)
|
75
|
+
end
|
76
|
+
|
77
|
+
def add_category(category)
|
78
|
+
@redis.sadd(:category_keys, category)
|
79
|
+
end
|
80
|
+
|
81
|
+
def category_keys
|
82
|
+
@redis.smembers(:category_keys).map(&:intern)
|
83
|
+
end
|
84
|
+
|
85
|
+
def category_word_frequency(category, word)
|
86
|
+
@redis.hget(category, word).to_i
|
87
|
+
end
|
88
|
+
|
89
|
+
def update_category_word_frequency(category, word, diff)
|
90
|
+
@redis.hincrby(category, word, diff)
|
91
|
+
end
|
92
|
+
|
93
|
+
def delete_category_word(category, word)
|
94
|
+
@redis.hdel(category, word)
|
95
|
+
end
|
96
|
+
|
97
|
+
def word_in_category?(category, word)
|
98
|
+
@redis.hexists(category, word)
|
99
|
+
end
|
100
|
+
|
101
|
+
def reset
|
102
|
+
@redis.flushdb
|
103
|
+
@redis.set(:total_words, 0)
|
104
|
+
@redis.set(:total_trainings, 0)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
class NoRedisError < LoadError
|
2
|
+
def initialize
|
3
|
+
msg =
|
4
|
+
%q{The Redis Backend can only be used if Redis is installed.
|
5
|
+
This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
|
6
|
+
If you have encountered this error and would like to use the Redis Backend,
|
7
|
+
please run 'gem install redis' or include 'gem "redis"' in
|
8
|
+
your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
|
9
|
+
}
|
10
|
+
super(msg)
|
11
|
+
end
|
12
|
+
end
|
@@ -2,7 +2,11 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
+
require 'set'
|
6
|
+
|
5
7
|
require_relative 'category_namer'
|
8
|
+
require_relative 'backends/bayes_memory_backend'
|
9
|
+
require_relative 'backends/bayes_redis_backend'
|
6
10
|
|
7
11
|
module ClassifierReborn
|
8
12
|
class Bayes
|
@@ -13,36 +17,45 @@ module ClassifierReborn
|
|
13
17
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
14
18
|
#
|
15
19
|
# Options available are:
|
16
|
-
# language: 'en'
|
17
|
-
# auto_categorize: false
|
18
|
-
# enable_threshold: false
|
19
|
-
# threshold: 0.0
|
20
|
-
# enable_stemmer: true
|
20
|
+
# language: 'en' Used to select language specific stop words
|
21
|
+
# auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
|
22
|
+
# enable_threshold: false When true, enables a threshold requirement for classification
|
23
|
+
# threshold: 0.0 Default threshold, only used when enabled
|
24
|
+
# enable_stemmer: true When false, disables word stemming
|
25
|
+
# stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
|
26
|
+
# backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
|
21
27
|
def initialize(*args)
|
22
|
-
@
|
28
|
+
@initial_categories = []
|
23
29
|
options = { language: 'en',
|
24
|
-
auto_categorize: false,
|
25
30
|
enable_threshold: false,
|
26
31
|
threshold: 0.0,
|
27
|
-
enable_stemmer: true
|
32
|
+
enable_stemmer: true,
|
33
|
+
backend: BayesMemoryBackend.new
|
28
34
|
}
|
29
35
|
args.flatten.each do |arg|
|
30
36
|
if arg.is_a?(Hash)
|
31
37
|
options.merge!(arg)
|
32
38
|
else
|
33
|
-
|
39
|
+
@initial_categories.push(arg)
|
34
40
|
end
|
35
41
|
end
|
36
42
|
|
37
|
-
|
38
|
-
|
39
|
-
|
43
|
+
unless options.key?(:auto_categorize)
|
44
|
+
options[:auto_categorize] = @initial_categories.empty? ? true : false
|
45
|
+
end
|
40
46
|
|
41
47
|
@language = options[:language]
|
42
48
|
@auto_categorize = options[:auto_categorize]
|
43
49
|
@enable_threshold = options[:enable_threshold]
|
44
50
|
@threshold = options[:threshold]
|
45
51
|
@enable_stemmer = options[:enable_stemmer]
|
52
|
+
@backend = options[:backend]
|
53
|
+
|
54
|
+
populate_initial_categories
|
55
|
+
|
56
|
+
if options.key?(:stopwords)
|
57
|
+
custom_stopwords options[:stopwords]
|
58
|
+
end
|
46
59
|
end
|
47
60
|
|
48
61
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -52,10 +65,12 @@ module ClassifierReborn
|
|
52
65
|
# b.train "that", "That text"
|
53
66
|
# b.train "The other", "The other text"
|
54
67
|
def train(category, text)
|
68
|
+
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
|
69
|
+
return if word_hash.empty?
|
55
70
|
category = CategoryNamer.prepare_name(category)
|
56
71
|
|
57
72
|
# Add the category dynamically or raise an error
|
58
|
-
unless
|
73
|
+
unless category_keys.include?(category)
|
59
74
|
if @auto_categorize
|
60
75
|
add_category(category)
|
61
76
|
else
|
@@ -63,12 +78,13 @@ module ClassifierReborn
|
|
63
78
|
end
|
64
79
|
end
|
65
80
|
|
66
|
-
|
67
|
-
|
68
|
-
@
|
69
|
-
@
|
70
|
-
@total_words += count
|
81
|
+
word_hash.each do |word, count|
|
82
|
+
@backend.update_category_word_frequency(category, word, count)
|
83
|
+
@backend.update_category_word_count(category, count)
|
84
|
+
@backend.update_total_words(count)
|
71
85
|
end
|
86
|
+
@backend.update_total_trainings(1)
|
87
|
+
@backend.update_category_training_count(category, 1)
|
72
88
|
end
|
73
89
|
|
74
90
|
# Provides an untraining method for all categories specified in Bayes#new
|
@@ -79,20 +95,23 @@ module ClassifierReborn
|
|
79
95
|
# b.train :this, "This text"
|
80
96
|
# b.untrain :this, "This text"
|
81
97
|
def untrain(category, text)
|
98
|
+
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
|
99
|
+
return if word_hash.empty?
|
82
100
|
category = CategoryNamer.prepare_name(category)
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
@
|
88
|
-
|
89
|
-
@categories[category].delete(word)
|
101
|
+
word_hash.each do |word, count|
|
102
|
+
next if @backend.total_words < 0
|
103
|
+
orig = @backend.category_word_frequency(category, word) || 0
|
104
|
+
@backend.update_category_word_frequency(category, word, -count)
|
105
|
+
if @backend.category_word_frequency(category, word) <= 0
|
106
|
+
@backend.delete_category_word(category, word)
|
90
107
|
count = orig
|
91
108
|
end
|
92
109
|
|
93
|
-
@
|
94
|
-
@
|
110
|
+
@backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
|
111
|
+
@backend.update_total_words(-count)
|
95
112
|
end
|
113
|
+
@backend.update_total_trainings(-1)
|
114
|
+
@backend.update_category_training_count(category, -1)
|
96
115
|
end
|
97
116
|
|
98
117
|
# Returns the scores in each category for the provided +text+. E.g.,
|
@@ -102,17 +121,22 @@ module ClassifierReborn
|
|
102
121
|
def classifications(text)
|
103
122
|
score = {}
|
104
123
|
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
|
105
|
-
|
106
|
-
|
124
|
+
if word_hash.empty?
|
125
|
+
category_keys.each do |category|
|
126
|
+
score[category.to_s] = Float::INFINITY
|
127
|
+
end
|
128
|
+
return score
|
129
|
+
end
|
130
|
+
category_keys.each do |category|
|
107
131
|
score[category.to_s] = 0
|
108
|
-
total = (@category_word_count
|
132
|
+
total = (@backend.category_word_count(category) || 1).to_f
|
109
133
|
word_hash.each do |word, _count|
|
110
|
-
s =
|
134
|
+
s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
|
111
135
|
score[category.to_s] += Math.log(s / total)
|
112
136
|
end
|
113
137
|
# now add prior probability for the category
|
114
|
-
s = @
|
115
|
-
score[category.to_s] += Math.log(s /
|
138
|
+
s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
|
139
|
+
score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
|
116
140
|
end
|
117
141
|
score
|
118
142
|
end
|
@@ -178,7 +202,7 @@ module ClassifierReborn
|
|
178
202
|
def method_missing(name, *args)
|
179
203
|
cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
|
180
204
|
category = CategoryNamer.prepare_name(cleaned_name)
|
181
|
-
if
|
205
|
+
if category_keys.include?(category)
|
182
206
|
args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
|
183
207
|
elsif name.to_s =~ /(un)?train_([\w]+)/
|
184
208
|
raise StandardError, "No such category: #{category}"
|
@@ -190,9 +214,17 @@ module ClassifierReborn
|
|
190
214
|
# Provides a list of category names
|
191
215
|
# For example:
|
192
216
|
# b.categories
|
193
|
-
# => [
|
194
|
-
def categories
|
195
|
-
|
217
|
+
# => ["This", "That", "The other"]
|
218
|
+
def categories
|
219
|
+
category_keys.collect(&:to_s)
|
220
|
+
end
|
221
|
+
|
222
|
+
# Provides a list of category keys as symbols
|
223
|
+
# For example:
|
224
|
+
# b.category_keys
|
225
|
+
# => [:This, :That, :"The other"]
|
226
|
+
def category_keys
|
227
|
+
@backend.category_keys
|
196
228
|
end
|
197
229
|
|
198
230
|
# Allows you to add categories to the classifier.
|
@@ -204,9 +236,37 @@ module ClassifierReborn
|
|
204
236
|
# more criteria than the trained selective categories. In short,
|
205
237
|
# try to initialize your categories at initialization.
|
206
238
|
def add_category(category)
|
207
|
-
|
239
|
+
category = CategoryNamer.prepare_name(category)
|
240
|
+
@backend.add_category(category)
|
208
241
|
end
|
209
242
|
|
210
243
|
alias_method :append_category, :add_category
|
244
|
+
|
245
|
+
def reset
|
246
|
+
@backend.reset
|
247
|
+
populate_initial_categories
|
248
|
+
end
|
249
|
+
|
250
|
+
private
|
251
|
+
|
252
|
+
def populate_initial_categories
|
253
|
+
@initial_categories.each do |c|
|
254
|
+
add_category(c)
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
# Overwrites the default stopwords for current language with supplied list of stopwords or file
|
259
|
+
def custom_stopwords(stopwords)
|
260
|
+
unless stopwords.is_a?(Enumerable)
|
261
|
+
if stopwords.strip.empty?
|
262
|
+
stopwords = []
|
263
|
+
elsif File.exist?(stopwords)
|
264
|
+
stopwords = File.read(stopwords).force_encoding("utf-8").split
|
265
|
+
else
|
266
|
+
return # Do not overwrite the default
|
267
|
+
end
|
268
|
+
end
|
269
|
+
Hasher::STOPWORDS[@language] = Set.new stopwords
|
270
|
+
end
|
211
271
|
end
|
212
272
|
end
|