stuff-classifier-chinese 0.51
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +162 -0
- data/Rakefile +12 -0
- data/lib/stuff-classifier.rb +17 -0
- data/lib/stuff-classifier/base.rb +190 -0
- data/lib/stuff-classifier/bayes.rb +81 -0
- data/lib/stuff-classifier/storage.rb +122 -0
- data/lib/stuff-classifier/tf-idf.rb +45 -0
- data/lib/stuff-classifier/tokenizer.rb +96 -0
- data/lib/stuff-classifier/tokenizer/tokenizer_properties.rb +81 -0
- data/lib/stuff-classifier/version.rb +4 -0
- data/stuff-classifier.gemspec +36 -0
- data/test/helper.rb +50 -0
- data/test/test_001_tokenizer.rb +51 -0
- data/test/test_002_base.rb +39 -0
- data/test/test_003_naive_bayes.rb +57 -0
- data/test/test_004_tf_idf.rb +38 -0
- data/test/test_005_in_memory_storage.rb +32 -0
- data/test/test_006_file_storage.rb +78 -0
- data/test/test_007_redis_storage.rb +82 -0
- metadata +253 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Alexandru Nedelcu
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
# stuff-classifier
|
2
|
+
|
3
|
+
A library for classifying text into multiple categories.
|
4
|
+
|
5
|
+
Currently provided classifiers:
|
6
|
+
|
7
|
+
- a [naive bayes classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
|
8
|
+
- a classifier based on [tf-idf weights](http://en.wikipedia.org/wiki/Tf%E2%80%93idf)
|
9
|
+
|
10
|
+
Ran a benchmark of 1345 items that I have previously manually
|
11
|
+
classified with multiple categories. Here's the rate over which the 2
|
12
|
+
algorithms have correctly detected one of those categories:
|
13
|
+
|
14
|
+
- Bayes: 79.26%
|
15
|
+
- Tf-Idf: 81.34%
|
16
|
+
|
17
|
+
I prefer the Naive Bayes approach, because while having lower stats on
|
18
|
+
this benchmark, it seems to make better decisions than I did in many
|
19
|
+
cases. For example, an item with title *"Paintball Session, 100 Balls
|
20
|
+
and Equipment"* was classified as *"Activities"* by me, but the bayes
|
21
|
+
classifier identified it as *"Sports"*, at which point I had an
|
22
|
+
intellectual orgasm. Also, the Tf-Idf classifier seems to do better on
|
23
|
+
clear-cut cases, but doesn't seem to handle uncertainty so well. Of
|
24
|
+
course, these are just quick tests I made and I have no idea which is
|
25
|
+
really better.
|
26
|
+
|
27
|
+
## Install
|
28
|
+
|
29
|
+
```bash
|
30
|
+
gem install stuff-classifier
|
31
|
+
```
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
You either instantiate one class or the other. Both have the same
|
36
|
+
signature:
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'stuff-classifier'
|
40
|
+
|
41
|
+
# for the naive bayes implementation
|
42
|
+
cls = StuffClassifier::Bayes.new("Cats or Dogs")
|
43
|
+
|
44
|
+
# for the Tf-Idf based implementation
|
45
|
+
cls = StuffClassifier::TfIdf.new("Cats or Dogs")
|
46
|
+
|
47
|
+
# these classifiers use word stemming by default, but if it has weird
|
48
|
+
# behavior, then you can disable it on init:
|
49
|
+
cls = StuffClassifier::TfIdf.new("Cats or Dogs", :stemming => false)
|
50
|
+
|
51
|
+
# also by default, the parsing phase filters out stop words, to
|
52
|
+
# disable or to come up with your own list of stop words, on a
|
53
|
+
# classifier instance you can do this:
|
54
|
+
cls.ignore_words = [ 'the', 'my', 'i', 'dont' ]
|
55
|
+
```
|
56
|
+
|
57
|
+
Training the classifier:
|
58
|
+
|
59
|
+
```ruby
|
60
|
+
cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
|
61
|
+
cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
|
62
|
+
cls.train(:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs")
|
63
|
+
cls.train(:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all")
|
64
|
+
cls.train(:dog, "So which one should you choose? A dog, definitely.")
|
65
|
+
cls.train(:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy")
|
66
|
+
cls.train(:dog, "A dog will eat anything, including birds or whatever meat")
|
67
|
+
cls.train(:cat, "My cat's favorite place to purr is on my keyboard")
|
68
|
+
cls.train(:dog, "My dog's favorite place to take a leak is the tree in front of our house")
|
69
|
+
```
|
70
|
+
|
71
|
+
And finally, classifying stuff:
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
cls.classify("This test is about cats.")
|
75
|
+
#=> :cat
|
76
|
+
cls.classify("I hate ...")
|
77
|
+
#=> :cat
|
78
|
+
cls.classify("The most annoying animal on earth.")
|
79
|
+
#=> :cat
|
80
|
+
cls.classify("The preferred company of software developers.")
|
81
|
+
#=> :cat
|
82
|
+
cls.classify("My precious, my favorite!")
|
83
|
+
#=> :cat
|
84
|
+
cls.classify("Get off my keyboard!")
|
85
|
+
#=> :cat
|
86
|
+
cls.classify("Kill that bird!")
|
87
|
+
#=> :cat
|
88
|
+
|
89
|
+
cls.classify("This test is about dogs.")
|
90
|
+
#=> :dog
|
91
|
+
cls.classify("Cats or Dogs?")
|
92
|
+
#=> :dog
|
93
|
+
cls.classify("What pet will I love more?")
|
94
|
+
#=> :dog
|
95
|
+
cls.classify("Willy, where the heck are you?")
|
96
|
+
#=> :dog
|
97
|
+
cls.classify("I like big buts and I cannot lie.")
|
98
|
+
#=> :dog
|
99
|
+
cls.classify("Why is the front door of our house open?")
|
100
|
+
#=> :dog
|
101
|
+
cls.classify("Who is eating my meat?")
|
102
|
+
#=> :dog
|
103
|
+
```
|
104
|
+
|
105
|
+
## Persistency
|
106
|
+
|
107
|
+
The following layers for saving the training data between sessions are
|
108
|
+
implemented:
|
109
|
+
|
110
|
+
- in memory (by default)
|
111
|
+
- on disk
|
112
|
+
- Redis
|
113
|
+
- (coming soon) in a RDBMS
|
114
|
+
|
115
|
+
To persist the data in Redis, you can do this:
|
116
|
+
```ruby
|
117
|
+
# defaults to redis running on localhost on default port
|
118
|
+
store = StuffClassifier::RedisStorage.new(@key)
|
119
|
+
|
120
|
+
# pass in connection args
|
121
|
+
store = StuffClassifier::RedisStorage.new(@key, {host:'my.redis.server.com', port: 4829})
|
122
|
+
```
|
123
|
+
|
124
|
+
To persist the data on disk, you can do this:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
store = StuffClassifier::FileStorage.new(@storage_path)
|
128
|
+
|
129
|
+
# global setting
|
130
|
+
StuffClassifier::Base.storage = store
|
131
|
+
|
132
|
+
# or alternative local setting on instantiation, by means of an
|
133
|
+
# optional param ...
|
134
|
+
cls = StuffClassifier::Bayes.new("Cats or Dogs", :storage => store)
|
135
|
+
|
136
|
+
# after training is done, to persist the data ...
|
137
|
+
cls.save_state
|
138
|
+
|
139
|
+
# or you could just do this:
|
140
|
+
StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
|
141
|
+
# when done, save_state is called on END
|
142
|
+
end
|
143
|
+
|
144
|
+
# to start fresh, deleting the saved training data for this classifier
|
145
|
+
StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true)
|
146
|
+
```
|
147
|
+
|
148
|
+
The name you give your classifier is important, as based on it the
|
149
|
+
data will get loaded and saved. For instance, following 3 classifiers
|
150
|
+
will be stored in different buckets, being independent of each other.
|
151
|
+
|
152
|
+
```ruby
|
153
|
+
cls1 = StuffClassifier::Bayes.new("Cats or Dogs")
|
154
|
+
cls2 = StuffClassifier::Bayes.new("True or False")
|
155
|
+
cls3 = StuffClassifier::Bayes.new("Spam or Ham")
|
156
|
+
```
|
157
|
+
|
158
|
+
## License
|
159
|
+
|
160
|
+
MIT Licensed. See LICENSE.txt for details.
|
161
|
+
|
162
|
+
|
data/lib/stuff-classifier.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-

# Gem entry point: registers every public constant with Kernel#autoload
# so each implementation file is loaded lazily, on first reference.
module StuffClassifier
  # path => constants defined in that file (all storage backends share
  # one file, hence the grouped list)
  {
    'stuff-classifier/version' => [:VERSION],
    'stuff-classifier/storage' => [:Storage, :InMemoryStorage, :FileStorage, :RedisStorage],
    'stuff-classifier/tokenizer' => [:Tokenizer],
    'stuff-classifier/tokenizer/tokenizer_properties' => [:TOKENIZER_PROPERTIES],
    'stuff-classifier/base' => [:Base],
    'stuff-classifier/bayes' => [:Bayes],
    'stuff-classifier/tf-idf' => [:TfIdf]
  }.each do |path, constants|
    constants.each {|const| autoload const, path }
  end
end
|
data/lib/stuff-classifier/base.rb
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-

# Abstract base class shared by the concrete classifiers
# (StuffClassifier::Bayes, StuffClassifier::TfIdf). It owns the word /
# category counters built up during training, the generic train /
# classify flow, and the persistence hooks. Subclasses must implement
# #cat_scores(text) => [[category, score], ...] sorted best-first.
class StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable
  attr_reader :name
  attr_reader :word_list
  attr_reader :category_list
  attr_reader :training_count

  attr_accessor :tokenizer
  attr_accessor :language

  attr_accessor :thresholds
  attr_accessor :min_prob

  # Attributes serialized by the storage layer between sessions.
  storable :version,:word_list,:category_list,:training_count,:thresholds,:min_prob

  # opts :
  #   language
  #   stemming : true | false
  #   weight
  #   assumed_prob
  #   storage
  #   purge_state ?
  def initialize(name, opts={})
    @version = StuffClassifier::VERSION

    # The classifier name doubles as the storage bucket key.
    @name = name

    # These values start empty and may be replaced by load_state below.
    @word_list = {}
    @category_list = {}
    @training_count = 0

    # storage: per-instance override via opts, else the class-level default
    @storage = opts[:storage] || StuffClassifier::Base.storage
    if opts[:purge_state]
      @storage.purge_state(self)
    else
      @storage.load_state(self)
    end

    # These values can be set during initialization or overridden after load_state
    @thresholds = opts[:thresholds] || {}
    @min_prob = opts[:min_prob] || 0.0

    @ignore_words = nil
    @tokenizer = StuffClassifier::Tokenizer.new(opts)
  end

  # Record one occurrence of +word+ inside +category+, updating the
  # per-word counters and the per-category word total.
  def incr_word(word, category)
    @word_list[word] ||= {}

    @word_list[word][:categories] ||= {}
    @word_list[word][:categories][category] ||= 0
    @word_list[word][:categories][category] += 1

    @word_list[word][:_total_word] ||= 0
    @word_list[word][:_total_word] += 1

    # words count by category
    @category_list[category] ||= {}
    @category_list[category][:_total_word] ||= 0
    @category_list[category][:_total_word] += 1
  end

  # Record one training document for +category+.
  def incr_cat(category)
    @category_list[category] ||= {}
    @category_list[category][:_count] ||= 0
    @category_list[category][:_count] += 1

    @training_count ||= 0
    @training_count += 1
  end

  # return number of times the word appears in a category
  def word_count(word, category)
    return 0.0 unless @word_list[word] && @word_list[word][:categories] && @word_list[word][:categories][category]
    @word_list[word][:categories][category].to_f
  end

  # return the number of times the word appears in all categories
  def total_word_count(word)
    return 0.0 unless @word_list[word] && @word_list[word][:_total_word]
    @word_list[word][:_total_word].to_f
  end

  # return the number of words in a category
  def total_word_count_in_cat(cat)
    return 0.0 unless @category_list[cat] && @category_list[cat][:_total_word]
    @category_list[cat][:_total_word].to_f
  end

  # return the total number of training items
  def total_cat_count
    @training_count
  end

  # return the number of training documents for a category, 0.0 when the
  # category is unknown (previously this indexed a nil hash and raised
  # NoMethodError for never-trained categories)
  def cat_count(category)
    return 0.0 unless @category_list[category] && @category_list[category][:_count]
    @category_list[category][:_count].to_f
  end

  # return the number of categories in which a word appears
  def categories_with_word_count(word)
    return 0 unless @word_list[word] && @word_list[word][:categories]
    @word_list[word][:categories].length
  end

  # return the number of categories
  def total_categories
    categories.length
  end

  # return categories list
  def categories
    @category_list.keys
  end

  # train the classifier: count every token of +text+ under +category+,
  # then count the document itself
  def train(category, text)
    @tokenizer.each_word(text) {|w| incr_word(w, category) }
    incr_cat(category)
  end

  # classify a text, returning the best-scoring category, or +default+
  # when nothing scores above @min_prob or the runner-up is too close
  def classify(text, default=nil)
    # Find the category with the highest probability
    max_prob = @min_prob
    best = nil

    scores = cat_scores(text)
    scores.each do |score|
      cat, prob = score
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    # Return the default category in case the threshold condition was
    # not met. For example, if the threshold for :spam is 1.2
    #
    # :spam => 0.73, :ham => 0.40 (OK)
    # :spam => 0.80, :ham => 0.70 (Fail, :ham is too close)

    return default unless best

    threshold = @thresholds[best] || 1.0

    scores.each do |score|
      cat, prob = score
      next if cat == best
      return default if prob * threshold > max_prob
    end

    return best
  end

  # Persist the storable attributes through the configured storage.
  def save_state
    @storage.save_state(self)
  end

  class << self
    attr_writer :storage

    # Class-level default storage, lazily initialized to in-memory.
    def storage
      @storage = StuffClassifier::InMemoryStorage.new unless defined? @storage
      @storage
    end

    # Build a classifier by name; with a block, yield it and persist its
    # state when the block returns.
    def open(name)
      inst = self.new(name)
      if block_given?
        yield inst
        inst.save_state
      else
        inst
      end
    end
  end
end
|
data/lib/stuff-classifier/bayes.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-

# Naive Bayes text classifier.
# http://en.wikipedia.org/wiki/Naive_Bayes_classifier
class StuffClassifier::Bayes < StuffClassifier::Base
  attr_accessor :weight
  attr_accessor :assumed_prob

  extend StuffClassifier::Storage::ActAsStorable
  # Extra attributes persisted alongside the ones declared in Base.
  storable :weight,:assumed_prob

  # Accepts the same options as Base, plus:
  #   :weight       - strength given to the assumed prior (default 1.0)
  #   :assumed_prob - prior probability for unseen words (default 0.1)
  def initialize(name, opts={})
    super(name, opts)
    @weight = opts[:weight] || 1.0
    @assumed_prob = opts[:assumed_prob] || 0.1
  end

  # P(word | cat): relative frequency of +word+ among all words trained
  # under +cat+; 0.0 for an empty category.
  def word_prob(word, cat)
    cat_total = total_word_count_in_cat(cat)
    cat_total == 0 ? 0.0 : word_count(word, cat).to_f / cat_total
  end

  # Probability blended with the assumed prior, so words seen only a few
  # times don't swing the result too hard. opts[:func] may supply an
  # alternative raw-probability function taking (word, cat).
  def word_weighted_average(word, cat, opts={})
    prob_fn = opts[:func]

    # raw probability for this word/category pair
    raw_prob = prob_fn ? prob_fn.call(word, cat) : word_prob(word, cat)

    # occurrences of this word across every category
    seen = total_word_count(word)

    # blend the prior with the observed probability
    (@weight * @assumed_prob + seen * raw_prob) / (@weight + seen)
  end

  # P(text | cat): product of the weighted per-word probabilities over
  # every token of +text+ (1 for an empty token stream).
  def doc_prob(text, category)
    @tokenizer.each_word(text).reduce(1) do |product, word|
      product * word_weighted_average(word, category)
    end
  end

  # Unnormalized P(cat | text): category prior times document likelihood.
  def text_prob(text, category)
    prior = cat_count(category) / total_cat_count
    prior * doc_prob(text, category)
  end

  # Score every known category for +text+; returns [[cat, prob], ...]
  # sorted by probability, highest first.
  def cat_scores(text)
    categories
      .map {|cat| [cat, text_prob(text, cat)] }
      .sort {|a, b| b[1] <=> a[1] }
  end

  # Debugging helper: prints every intermediate probability computed for
  # a single word across all categories.
  def word_classification_detail(word)
    p "word_prob"
    p categories.each_with_object({}) {|cat, h| h[cat] = word_prob(word, cat) }

    p "word_weighted_average"
    p categories.each_with_object({}) {|cat, h| h[cat] = word_weighted_average(word, cat) }

    p "doc_prob"
    p categories.each_with_object({}) {|cat, h| h[cat] = doc_prob(word, cat) }

    p "text_prob"
    p categories.each_with_object({}) {|cat, h| h[cat] = text_prob(word, cat) }
  end
end
|