classifier-reborn 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +74 -1
- data/README.markdown +57 -227
- data/data/stopwords/ar +104 -0
- data/data/stopwords/bn +362 -0
- data/data/stopwords/hi +97 -0
- data/data/stopwords/ja +43 -0
- data/data/stopwords/ru +420 -0
- data/data/stopwords/tr +199 -30
- data/data/stopwords/vi +647 -0
- data/data/stopwords/zh +125 -0
- data/lib/classifier-reborn.rb +9 -0
- data/lib/classifier-reborn/backends/bayes_memory_backend.rb +75 -0
- data/lib/classifier-reborn/backends/bayes_redis_backend.rb +107 -0
- data/lib/classifier-reborn/backends/no_redis_error.rb +12 -0
- data/lib/classifier-reborn/bayes.rb +98 -38
- data/lib/classifier-reborn/category_namer.rb +0 -1
- data/lib/classifier-reborn/extensions/hasher.rb +1 -1
- data/lib/classifier-reborn/lsi.rb +5 -1
- data/lib/classifier-reborn/lsi/word_list.rb +2 -4
- data/lib/classifier-reborn/validators/classifier_validator.rb +169 -0
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +30 -8
- data/bin/bayes.rb +0 -36
- data/bin/summarize.rb +0 -16
data/data/stopwords/zh
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
|
2
|
+
的
|
3
|
+
一
|
4
|
+
不
|
5
|
+
在
|
6
|
+
人
|
7
|
+
有
|
8
|
+
是
|
9
|
+
为
|
10
|
+
以
|
11
|
+
于
|
12
|
+
上
|
13
|
+
他
|
14
|
+
而
|
15
|
+
后
|
16
|
+
之
|
17
|
+
来
|
18
|
+
及
|
19
|
+
了
|
20
|
+
因
|
21
|
+
下
|
22
|
+
可
|
23
|
+
到
|
24
|
+
由
|
25
|
+
这
|
26
|
+
与
|
27
|
+
也
|
28
|
+
此
|
29
|
+
但
|
30
|
+
并
|
31
|
+
个
|
32
|
+
其
|
33
|
+
已
|
34
|
+
无
|
35
|
+
小
|
36
|
+
我
|
37
|
+
们
|
38
|
+
起
|
39
|
+
最
|
40
|
+
再
|
41
|
+
今
|
42
|
+
去
|
43
|
+
好
|
44
|
+
只
|
45
|
+
又
|
46
|
+
或
|
47
|
+
很
|
48
|
+
亦
|
49
|
+
某
|
50
|
+
把
|
51
|
+
那
|
52
|
+
你
|
53
|
+
乃
|
54
|
+
它
|
55
|
+
吧
|
56
|
+
被
|
57
|
+
比
|
58
|
+
别
|
59
|
+
趁
|
60
|
+
当
|
61
|
+
从
|
62
|
+
到
|
63
|
+
得 打
|
64
|
+
凡
|
65
|
+
儿
|
66
|
+
尔
|
67
|
+
该
|
68
|
+
各
|
69
|
+
给
|
70
|
+
跟
|
71
|
+
和
|
72
|
+
何
|
73
|
+
还
|
74
|
+
即
|
75
|
+
几
|
76
|
+
既
|
77
|
+
看
|
78
|
+
据
|
79
|
+
距
|
80
|
+
靠
|
81
|
+
啦
|
82
|
+
了
|
83
|
+
另
|
84
|
+
么
|
85
|
+
每
|
86
|
+
们
|
87
|
+
嘛
|
88
|
+
拿
|
89
|
+
哪
|
90
|
+
那
|
91
|
+
您
|
92
|
+
凭
|
93
|
+
且
|
94
|
+
却
|
95
|
+
让
|
96
|
+
仍
|
97
|
+
啥
|
98
|
+
如
|
99
|
+
若
|
100
|
+
使
|
101
|
+
谁
|
102
|
+
虽
|
103
|
+
随
|
104
|
+
同
|
105
|
+
所
|
106
|
+
她
|
107
|
+
哇
|
108
|
+
嗡
|
109
|
+
往
|
110
|
+
哪
|
111
|
+
些
|
112
|
+
向
|
113
|
+
沿
|
114
|
+
哟
|
115
|
+
用
|
116
|
+
于
|
117
|
+
咱
|
118
|
+
则
|
119
|
+
怎
|
120
|
+
曾
|
121
|
+
至
|
122
|
+
致
|
123
|
+
着
|
124
|
+
诸
|
125
|
+
自
|
data/lib/classifier-reborn.rb
CHANGED
@@ -25,6 +25,15 @@
|
|
25
25
|
# License:: LGPL
|
26
26
|
|
27
27
|
require 'rubygems'
|
28
|
+
|
29
|
+
case RUBY_PLATFORM
|
30
|
+
when 'java'
|
31
|
+
require 'jruby-stemmer'
|
32
|
+
else
|
33
|
+
require 'fast-stemmer'
|
34
|
+
end
|
35
|
+
|
28
36
|
require_relative 'classifier-reborn/category_namer'
|
29
37
|
require_relative 'classifier-reborn/bayes'
|
30
38
|
require_relative 'classifier-reborn/lsi'
|
39
|
+
require_relative 'classifier-reborn/validators/classifier_validator'
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module ClassifierReborn
|
2
|
+
class BayesMemoryBackend
|
3
|
+
attr_reader :total_words, :total_trainings
|
4
|
+
|
5
|
+
# This class provides Memory as the storage backend for the classifier data structures
|
6
|
+
def initialize
|
7
|
+
@total_words = 0
|
8
|
+
@total_trainings = 0
|
9
|
+
@category_counts = {}
|
10
|
+
@categories = {}
|
11
|
+
end
|
12
|
+
|
13
|
+
def update_total_words(diff)
|
14
|
+
@total_words += diff
|
15
|
+
end
|
16
|
+
|
17
|
+
def update_total_trainings(diff)
|
18
|
+
@total_trainings += diff
|
19
|
+
end
|
20
|
+
|
21
|
+
def category_training_count(category)
|
22
|
+
category_counts(category)[:training]
|
23
|
+
end
|
24
|
+
|
25
|
+
def update_category_training_count(category, diff)
|
26
|
+
category_counts(category)[:training] += diff
|
27
|
+
end
|
28
|
+
|
29
|
+
def category_has_trainings?(category)
|
30
|
+
@category_counts.key?(category) && category_training_count(category) > 0
|
31
|
+
end
|
32
|
+
|
33
|
+
def category_word_count(category)
|
34
|
+
category_counts(category)[:word]
|
35
|
+
end
|
36
|
+
|
37
|
+
def update_category_word_count(category, diff)
|
38
|
+
category_counts(category)[:word] += diff
|
39
|
+
end
|
40
|
+
|
41
|
+
def add_category(category)
|
42
|
+
@categories[category] ||= Hash.new(0)
|
43
|
+
end
|
44
|
+
|
45
|
+
def category_keys
|
46
|
+
@categories.keys
|
47
|
+
end
|
48
|
+
|
49
|
+
def category_word_frequency(category, word)
|
50
|
+
@categories[category][word]
|
51
|
+
end
|
52
|
+
|
53
|
+
def update_category_word_frequency(category, word, diff)
|
54
|
+
@categories[category][word] += diff
|
55
|
+
end
|
56
|
+
|
57
|
+
def delete_category_word(category, word)
|
58
|
+
@categories[category].delete(word)
|
59
|
+
end
|
60
|
+
|
61
|
+
def word_in_category?(category, word)
|
62
|
+
@categories[category].key?(word)
|
63
|
+
end
|
64
|
+
|
65
|
+
def reset
|
66
|
+
initialize
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def category_counts(category)
|
72
|
+
@category_counts[category] ||= {training: 0, word: 0}
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require_relative 'no_redis_error'
|
2
|
+
# require redis when we run #initialize. This way only people using this backend
|
3
|
+
# will need to install and load the backend without having to
|
4
|
+
# require 'classifier-reborn/backends/bayes_redis_backend'
|
5
|
+
|
6
|
+
module ClassifierReborn
|
7
|
+
# This class provides Redis as the storage backend for the classifier data structures
|
8
|
+
class BayesRedisBackend
|
9
|
+
# The class can be created with the same arguments that the redis gem accepts
|
10
|
+
# E.g.,
|
11
|
+
# b = ClassifierReborn::BayesRedisBackend.new
|
12
|
+
# b = ClassifierReborn::BayesRedisBackend.new host: "10.0.1.1", port: 6380, db: 2
|
13
|
+
# b = ClassifierReborn::BayesRedisBackend.new url: "redis://:secret@10.0.1.1:6380/2"
|
14
|
+
#
|
15
|
+
# Options available are:
|
16
|
+
# url: lambda { ENV["REDIS_URL"] }
|
17
|
+
# scheme: "redis"
|
18
|
+
# host: "127.0.0.1"
|
19
|
+
# port: 6379
|
20
|
+
# path: nil
|
21
|
+
# timeout: 5.0
|
22
|
+
# password: nil
|
23
|
+
# db: 0
|
24
|
+
# driver: nil
|
25
|
+
# id: nil
|
26
|
+
# tcp_keepalive: 0
|
27
|
+
# reconnect_attempts: 1
|
28
|
+
# inherit_socket: false
|
29
|
+
def initialize(options = {})
|
30
|
+
begin # because some people don't have redis installed
|
31
|
+
require 'redis'
|
32
|
+
rescue LoadError
|
33
|
+
raise NoRedisError
|
34
|
+
end
|
35
|
+
|
36
|
+
@redis = Redis.new(options)
|
37
|
+
@redis.setnx(:total_words, 0)
|
38
|
+
@redis.setnx(:total_trainings, 0)
|
39
|
+
end
|
40
|
+
|
41
|
+
def total_words
|
42
|
+
@redis.get(:total_words).to_i
|
43
|
+
end
|
44
|
+
|
45
|
+
def update_total_words(diff)
|
46
|
+
@redis.incrby(:total_words, diff)
|
47
|
+
end
|
48
|
+
|
49
|
+
def total_trainings
|
50
|
+
@redis.get(:total_trainings).to_i
|
51
|
+
end
|
52
|
+
|
53
|
+
def update_total_trainings(diff)
|
54
|
+
@redis.incrby(:total_trainings, diff)
|
55
|
+
end
|
56
|
+
|
57
|
+
def category_training_count(category)
|
58
|
+
@redis.hget(:category_training_count, category).to_i
|
59
|
+
end
|
60
|
+
|
61
|
+
def update_category_training_count(category, diff)
|
62
|
+
@redis.hincrby(:category_training_count, category, diff)
|
63
|
+
end
|
64
|
+
|
65
|
+
def category_has_trainings?(category)
|
66
|
+
category_training_count(category) > 0
|
67
|
+
end
|
68
|
+
|
69
|
+
def category_word_count(category)
|
70
|
+
@redis.hget(:category_word_count, category).to_i
|
71
|
+
end
|
72
|
+
|
73
|
+
def update_category_word_count(category, diff)
|
74
|
+
@redis.hincrby(:category_word_count, category, diff)
|
75
|
+
end
|
76
|
+
|
77
|
+
def add_category(category)
|
78
|
+
@redis.sadd(:category_keys, category)
|
79
|
+
end
|
80
|
+
|
81
|
+
def category_keys
|
82
|
+
@redis.smembers(:category_keys).map(&:intern)
|
83
|
+
end
|
84
|
+
|
85
|
+
def category_word_frequency(category, word)
|
86
|
+
@redis.hget(category, word).to_i
|
87
|
+
end
|
88
|
+
|
89
|
+
def update_category_word_frequency(category, word, diff)
|
90
|
+
@redis.hincrby(category, word, diff)
|
91
|
+
end
|
92
|
+
|
93
|
+
def delete_category_word(category, word)
|
94
|
+
@redis.hdel(category, word)
|
95
|
+
end
|
96
|
+
|
97
|
+
def word_in_category?(category, word)
|
98
|
+
@redis.hexists(category, word)
|
99
|
+
end
|
100
|
+
|
101
|
+
def reset
|
102
|
+
@redis.flushdb
|
103
|
+
@redis.set(:total_words, 0)
|
104
|
+
@redis.set(:total_trainings, 0)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
class NoRedisError < LoadError
|
2
|
+
def initialize
|
3
|
+
msg =
|
4
|
+
%q{The Redis Backend can only be used if Redis is installed.
|
5
|
+
This error is raised from 'lib/classifier-reborn/backends/bayes_redis_backend.rb'.
|
6
|
+
If you have encountered this error and would like to use the Redis Backend,
|
7
|
+
please run 'gem install redis' or include 'gem "redis"' in
|
8
|
+
your gemfile. For more info see https://github.com/jekyll/classifier-reborn#usage.
|
9
|
+
}
|
10
|
+
super(msg)
|
11
|
+
end
|
12
|
+
end
|
@@ -2,7 +2,11 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
+
require 'set'
|
6
|
+
|
5
7
|
require_relative 'category_namer'
|
8
|
+
require_relative 'backends/bayes_memory_backend'
|
9
|
+
require_relative 'backends/bayes_redis_backend'
|
6
10
|
|
7
11
|
module ClassifierReborn
|
8
12
|
class Bayes
|
@@ -13,36 +17,45 @@ module ClassifierReborn
|
|
13
17
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
14
18
|
#
|
15
19
|
# Options available are:
|
16
|
-
# language: 'en'
|
17
|
-
# auto_categorize: false
|
18
|
-
# enable_threshold: false
|
19
|
-
# threshold: 0.0
|
20
|
-
# enable_stemmer: true
|
20
|
+
# language: 'en' Used to select language specific stop words
|
21
|
+
# auto_categorize: false When true, enables ability to dynamically declare a category; the default is true if no initial categories are provided
|
22
|
+
# enable_threshold: false When true, enables a threshold requirement for classification
|
23
|
+
# threshold: 0.0 Default threshold, only used when enabled
|
24
|
+
# enable_stemmer: true When false, disables word stemming
|
25
|
+
# stopwords: nil Accepts path to a text file or an array of words, when supplied, overwrites the default stopwords; assign empty string or array to disable stopwords
|
26
|
+
# backend: BayesMemoryBackend.new Alternatively, BayesRedisBackend.new for persistent storage
|
21
27
|
def initialize(*args)
|
22
|
-
@
|
28
|
+
@initial_categories = []
|
23
29
|
options = { language: 'en',
|
24
|
-
auto_categorize: false,
|
25
30
|
enable_threshold: false,
|
26
31
|
threshold: 0.0,
|
27
|
-
enable_stemmer: true
|
32
|
+
enable_stemmer: true,
|
33
|
+
backend: BayesMemoryBackend.new
|
28
34
|
}
|
29
35
|
args.flatten.each do |arg|
|
30
36
|
if arg.is_a?(Hash)
|
31
37
|
options.merge!(arg)
|
32
38
|
else
|
33
|
-
|
39
|
+
@initial_categories.push(arg)
|
34
40
|
end
|
35
41
|
end
|
36
42
|
|
37
|
-
|
38
|
-
|
39
|
-
|
43
|
+
unless options.key?(:auto_categorize)
|
44
|
+
options[:auto_categorize] = @initial_categories.empty? ? true : false
|
45
|
+
end
|
40
46
|
|
41
47
|
@language = options[:language]
|
42
48
|
@auto_categorize = options[:auto_categorize]
|
43
49
|
@enable_threshold = options[:enable_threshold]
|
44
50
|
@threshold = options[:threshold]
|
45
51
|
@enable_stemmer = options[:enable_stemmer]
|
52
|
+
@backend = options[:backend]
|
53
|
+
|
54
|
+
populate_initial_categories
|
55
|
+
|
56
|
+
if options.key?(:stopwords)
|
57
|
+
custom_stopwords options[:stopwords]
|
58
|
+
end
|
46
59
|
end
|
47
60
|
|
48
61
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -52,10 +65,12 @@ module ClassifierReborn
|
|
52
65
|
# b.train "that", "That text"
|
53
66
|
# b.train "The other", "The other text"
|
54
67
|
def train(category, text)
|
68
|
+
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
|
69
|
+
return if word_hash.empty?
|
55
70
|
category = CategoryNamer.prepare_name(category)
|
56
71
|
|
57
72
|
# Add the category dynamically or raise an error
|
58
|
-
unless
|
73
|
+
unless category_keys.include?(category)
|
59
74
|
if @auto_categorize
|
60
75
|
add_category(category)
|
61
76
|
else
|
@@ -63,12 +78,13 @@ module ClassifierReborn
|
|
63
78
|
end
|
64
79
|
end
|
65
80
|
|
66
|
-
|
67
|
-
|
68
|
-
@
|
69
|
-
@
|
70
|
-
@total_words += count
|
81
|
+
word_hash.each do |word, count|
|
82
|
+
@backend.update_category_word_frequency(category, word, count)
|
83
|
+
@backend.update_category_word_count(category, count)
|
84
|
+
@backend.update_total_words(count)
|
71
85
|
end
|
86
|
+
@backend.update_total_trainings(1)
|
87
|
+
@backend.update_category_training_count(category, 1)
|
72
88
|
end
|
73
89
|
|
74
90
|
# Provides an untraining method for all categories specified in Bayes#new
|
@@ -79,20 +95,23 @@ module ClassifierReborn
|
|
79
95
|
# b.train :this, "This text"
|
80
96
|
# b.untrain :this, "This text"
|
81
97
|
def untrain(category, text)
|
98
|
+
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
|
99
|
+
return if word_hash.empty?
|
82
100
|
category = CategoryNamer.prepare_name(category)
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
@
|
88
|
-
|
89
|
-
@categories[category].delete(word)
|
101
|
+
word_hash.each do |word, count|
|
102
|
+
next if @backend.total_words < 0
|
103
|
+
orig = @backend.category_word_frequency(category, word) || 0
|
104
|
+
@backend.update_category_word_frequency(category, word, -count)
|
105
|
+
if @backend.category_word_frequency(category, word) <= 0
|
106
|
+
@backend.delete_category_word(category, word)
|
90
107
|
count = orig
|
91
108
|
end
|
92
109
|
|
93
|
-
@
|
94
|
-
@
|
110
|
+
@backend.update_category_word_count(category, -count) if @backend.category_word_count(category) >= count
|
111
|
+
@backend.update_total_words(-count)
|
95
112
|
end
|
113
|
+
@backend.update_total_trainings(-1)
|
114
|
+
@backend.update_category_training_count(category, -1)
|
96
115
|
end
|
97
116
|
|
98
117
|
# Returns the scores in each category for the provided +text+. E.g.,
|
@@ -102,17 +121,22 @@ module ClassifierReborn
|
|
102
121
|
def classifications(text)
|
103
122
|
score = {}
|
104
123
|
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
|
105
|
-
|
106
|
-
|
124
|
+
if word_hash.empty?
|
125
|
+
category_keys.each do |category|
|
126
|
+
score[category.to_s] = Float::INFINITY
|
127
|
+
end
|
128
|
+
return score
|
129
|
+
end
|
130
|
+
category_keys.each do |category|
|
107
131
|
score[category.to_s] = 0
|
108
|
-
total = (@category_word_count
|
132
|
+
total = (@backend.category_word_count(category) || 1).to_f
|
109
133
|
word_hash.each do |word, _count|
|
110
|
-
s =
|
134
|
+
s = @backend.word_in_category?(category, word) ? @backend.category_word_frequency(category, word) : 0.1
|
111
135
|
score[category.to_s] += Math.log(s / total)
|
112
136
|
end
|
113
137
|
# now add prior probability for the category
|
114
|
-
s = @
|
115
|
-
score[category.to_s] += Math.log(s /
|
138
|
+
s = @backend.category_has_trainings?(category) ? @backend.category_training_count(category) : 0.1
|
139
|
+
score[category.to_s] += Math.log(s / @backend.total_trainings.to_f)
|
116
140
|
end
|
117
141
|
score
|
118
142
|
end
|
@@ -178,7 +202,7 @@ module ClassifierReborn
|
|
178
202
|
def method_missing(name, *args)
|
179
203
|
cleaned_name = name.to_s.gsub(/(un)?train_([\w]+)/, '\2')
|
180
204
|
category = CategoryNamer.prepare_name(cleaned_name)
|
181
|
-
if
|
205
|
+
if category_keys.include?(category)
|
182
206
|
args.each { |text| eval("#{Regexp.last_match(1)}train(category, text)") }
|
183
207
|
elsif name.to_s =~ /(un)?train_([\w]+)/
|
184
208
|
raise StandardError, "No such category: #{category}"
|
@@ -190,9 +214,17 @@ module ClassifierReborn
|
|
190
214
|
# Provides a list of category names
|
191
215
|
# For example:
|
192
216
|
# b.categories
|
193
|
-
# => [
|
194
|
-
def categories
|
195
|
-
|
217
|
+
# => ["This", "That", "The other"]
|
218
|
+
def categories
|
219
|
+
category_keys.collect(&:to_s)
|
220
|
+
end
|
221
|
+
|
222
|
+
# Provides a list of category keys as symbols
|
223
|
+
# For example:
|
224
|
+
# b.category_keys
|
225
|
+
# => [:This, :That, :"The other"]
|
226
|
+
def category_keys
|
227
|
+
@backend.category_keys
|
196
228
|
end
|
197
229
|
|
198
230
|
# Allows you to add categories to the classifier.
|
@@ -204,9 +236,37 @@ module ClassifierReborn
|
|
204
236
|
# more criteria than the trained selective categories. In short,
|
205
237
|
# try to initialize your categories at initialization.
|
206
238
|
def add_category(category)
|
207
|
-
|
239
|
+
category = CategoryNamer.prepare_name(category)
|
240
|
+
@backend.add_category(category)
|
208
241
|
end
|
209
242
|
|
210
243
|
alias_method :append_category, :add_category
|
244
|
+
|
245
|
+
def reset
|
246
|
+
@backend.reset
|
247
|
+
populate_initial_categories
|
248
|
+
end
|
249
|
+
|
250
|
+
private
|
251
|
+
|
252
|
+
def populate_initial_categories
|
253
|
+
@initial_categories.each do |c|
|
254
|
+
add_category(c)
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
# Overwrites the default stopwords for current language with supplied list of stopwords or file
|
259
|
+
def custom_stopwords(stopwords)
|
260
|
+
unless stopwords.is_a?(Enumerable)
|
261
|
+
if stopwords.strip.empty?
|
262
|
+
stopwords = []
|
263
|
+
elsif File.exist?(stopwords)
|
264
|
+
stopwords = File.read(stopwords).force_encoding("utf-8").split
|
265
|
+
else
|
266
|
+
return # Do not overwrite the default
|
267
|
+
end
|
268
|
+
end
|
269
|
+
Hasher::STOPWORDS[@language] = Set.new stopwords
|
270
|
+
end
|
211
271
|
end
|
212
272
|
end
|