stuff-classifier-chinese 0.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +162 -0
- data/Rakefile +12 -0
- data/lib/stuff-classifier.rb +17 -0
- data/lib/stuff-classifier/base.rb +190 -0
- data/lib/stuff-classifier/bayes.rb +81 -0
- data/lib/stuff-classifier/storage.rb +122 -0
- data/lib/stuff-classifier/tf-idf.rb +45 -0
- data/lib/stuff-classifier/tokenizer.rb +96 -0
- data/lib/stuff-classifier/tokenizer/tokenizer_properties.rb +81 -0
- data/lib/stuff-classifier/version.rb +4 -0
- data/stuff-classifier.gemspec +36 -0
- data/test/helper.rb +50 -0
- data/test/test_001_tokenizer.rb +51 -0
- data/test/test_002_base.rb +39 -0
- data/test/test_003_naive_bayes.rb +57 -0
- data/test/test_004_tf_idf.rb +38 -0
- data/test/test_005_in_memory_storage.rb +32 -0
- data/test/test_006_file_storage.rb +78 -0
- data/test/test_007_redis_storage.rb +82 -0
- metadata +253 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2012 Alexandru Nedelcu

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,162 @@
# stuff-classifier

A library for classifying text into multiple categories.

Currently provided classifiers:

- a [naive bayes classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
- a classifier based on [tf-idf weights](http://en.wikipedia.org/wiki/Tf%E2%80%93idf)

I ran a benchmark of 1345 items that I had previously classified by hand
into multiple categories. Here's the rate at which the two algorithms
correctly detected one of those categories:

- Bayes: 79.26%
- Tf-Idf: 81.34%

I prefer the naive Bayes approach: although it scores lower on this
benchmark, it seems to make better decisions than I did in many
cases. For example, an item titled *"Paintball Session, 100 Balls
and Equipment"* was classified as *"Activities"* by me, but the Bayes
classifier identified it as *"Sports"*, at which point I had an
intellectual orgasm. The Tf-Idf classifier does better on clear-cut
cases, but doesn't handle uncertainty as well. Of course, these are
just quick tests, and I have no idea which is really better.

## Install

```bash
gem install stuff-classifier
```

## Usage

You either instantiate one class or the other. Both have the same
signature:

```ruby
require 'stuff-classifier'

# for the naive bayes implementation
cls = StuffClassifier::Bayes.new("Cats or Dogs")

# for the Tf-Idf based implementation
cls = StuffClassifier::TfIdf.new("Cats or Dogs")

# these classifiers use word stemming by default, but if it behaves
# oddly you can disable it on init:
cls = StuffClassifier::TfIdf.new("Cats or Dogs", :stemming => false)

# also by default, the parsing phase filters out stop words; to
# disable this, or to supply your own list of stop words, do this on a
# classifier instance:
cls.ignore_words = [ 'the', 'my', 'i', 'dont' ]
```

Training the classifier:

```ruby
cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
cls.train(:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs")
cls.train(:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all")
cls.train(:dog, "So which one should you choose? A dog, definitely.")
cls.train(:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy")
cls.train(:dog, "A dog will eat anything, including birds or whatever meat")
cls.train(:cat, "My cat's favorite place to purr is on my keyboard")
cls.train(:dog, "My dog's favorite place to take a leak is the tree in front of our house")
```

And finally, classifying stuff:

```ruby
cls.classify("This test is about cats.")
#=> :cat
cls.classify("I hate ...")
#=> :cat
cls.classify("The most annoying animal on earth.")
#=> :cat
cls.classify("The preferred company of software developers.")
#=> :cat
cls.classify("My precious, my favorite!")
#=> :cat
cls.classify("Get off my keyboard!")
#=> :cat
cls.classify("Kill that bird!")
#=> :cat

cls.classify("This test is about dogs.")
#=> :dog
cls.classify("Cats or Dogs?")
#=> :dog
cls.classify("What pet will I love more?")
#=> :dog
cls.classify("Willy, where the heck are you?")
#=> :dog
cls.classify("I like big buts and I cannot lie.")
#=> :dog
cls.classify("Why is the front door of our house open?")
#=> :dog
cls.classify("Who is eating my meat?")
#=> :dog
```

## Persistency

The following layers for saving the training data between sessions are
implemented:

- in memory (the default)
- on disk
- Redis
- (coming soon) in an RDBMS

To persist the data in Redis, you can do this:

```ruby
# defaults to redis running on localhost on the default port
store = StuffClassifier::RedisStorage.new(@key)

# or pass in connection args
store = StuffClassifier::RedisStorage.new(@key, {host: 'my.redis.server.com', port: 4829})
```

To persist the data on disk, you can do this:

```ruby
store = StuffClassifier::FileStorage.new(@storage_path)

# global setting
StuffClassifier::Base.storage = store

# or an alternative local setting on instantiation, by means of an
# optional param ...
cls = StuffClassifier::Bayes.new("Cats or Dogs", :storage => store)

# after training is done, to persist the data ...
cls.save_state

# or you could just do this:
StuffClassifier::Bayes.open("Cats or Dogs") do |cls|
  # when done, save_state is called on END
end

# to start fresh, deleting the saved training data for this classifier
StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true)
```

The name you give your classifier is important, as the data will be
loaded and saved based on it. For instance, the following 3 classifiers
will be stored in different buckets and are independent of each other:

```ruby
cls1 = StuffClassifier::Bayes.new("Cats or Dogs")
cls2 = StuffClassifier::Bayes.new("True or False")
cls3 = StuffClassifier::Bayes.new("Spam or Ham")
```

## License

MIT Licensed. See LICENSE.txt for details.
data/Rakefile
ADDED
data/lib/stuff-classifier.rb
ADDED
@@ -0,0 +1,17 @@
# -*- encoding : utf-8 -*-
module StuffClassifier
  autoload :VERSION, 'stuff-classifier/version'

  autoload :Storage, 'stuff-classifier/storage'
  autoload :InMemoryStorage, 'stuff-classifier/storage'
  autoload :FileStorage, 'stuff-classifier/storage'
  autoload :RedisStorage, 'stuff-classifier/storage'

  autoload :Tokenizer, 'stuff-classifier/tokenizer'
  autoload :TOKENIZER_PROPERTIES, 'stuff-classifier/tokenizer/tokenizer_properties'

  autoload :Base, 'stuff-classifier/base'
  autoload :Bayes, 'stuff-classifier/bayes'
  autoload :TfIdf, 'stuff-classifier/tf-idf'

end
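The entry file above only registers autoloads, so requiring the gem is cheap: each file under `stuff-classifier/` is loaded the first time its constant is referenced. A minimal sketch of what that implies (the classifier name is invented for illustration):

```ruby
require 'stuff-classifier'

# At this point only lib/stuff-classifier.rb (the autoload table) has run.
# Referencing the constant triggers Ruby's autoload, which loads
# 'stuff-classifier/bayes' and, through it, Base, Storage, Tokenizer, etc.
cls = StuffClassifier::Bayes.new("Autoload Demo")
puts cls.name  #=> "Autoload Demo"
```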
data/lib/stuff-classifier/base.rb
ADDED
@@ -0,0 +1,190 @@
# -*- encoding : utf-8 -*-

class StuffClassifier::Base
  extend StuffClassifier::Storage::ActAsStorable
  attr_reader :name
  attr_reader :word_list
  attr_reader :category_list
  attr_reader :training_count

  attr_accessor :tokenizer
  attr_accessor :language

  attr_accessor :thresholds
  attr_accessor :min_prob


  storable :version, :word_list, :category_list, :training_count, :thresholds, :min_prob

  # opts :
  #   language
  #   stemming : true | false
  #   weight
  #   assumed_prob
  #   storage
  #   purge_state ?

  def initialize(name, opts={})
    @version = StuffClassifier::VERSION

    @name = name

    # These values are nil or are loaded from storage
    @word_list = {}
    @category_list = {}
    @training_count = 0

    # storage
    purge_state = opts[:purge_state]
    @storage = opts[:storage] || StuffClassifier::Base.storage
    if purge_state
      @storage.purge_state(self)
    else
      @storage.load_state(self)
    end

    # These values can be set during initialization or overridden after load_state
    @thresholds = opts[:thresholds] || {}
    @min_prob = opts[:min_prob] || 0.0

    @ignore_words = nil
    @tokenizer = StuffClassifier::Tokenizer.new(opts)
  end

  def incr_word(word, category)
    @word_list[word] ||= {}

    @word_list[word][:categories] ||= {}
    @word_list[word][:categories][category] ||= 0
    @word_list[word][:categories][category] += 1

    @word_list[word][:_total_word] ||= 0
    @word_list[word][:_total_word] += 1

    # word count by category
    @category_list[category] ||= {}
    @category_list[category][:_total_word] ||= 0
    @category_list[category][:_total_word] += 1
  end

  def incr_cat(category)
    @category_list[category] ||= {}
    @category_list[category][:_count] ||= 0
    @category_list[category][:_count] += 1

    @training_count ||= 0
    @training_count += 1
  end

  # return the number of times the word appears in a category
  def word_count(word, category)
    return 0.0 unless @word_list[word] && @word_list[word][:categories] && @word_list[word][:categories][category]
    @word_list[word][:categories][category].to_f
  end

  # return the number of times the word appears in all categories
  def total_word_count(word)
    return 0.0 unless @word_list[word] && @word_list[word][:_total_word]
    @word_list[word][:_total_word].to_f
  end

  # return the number of words in a category
  def total_word_count_in_cat(cat)
    return 0.0 unless @category_list[cat] && @category_list[cat][:_total_word]
    @category_list[cat][:_total_word].to_f
  end

  # return the number of training items
  def total_cat_count
    @training_count
  end

  # return the number of training documents for a category
  def cat_count(category)
    @category_list[category][:_count] ? @category_list[category][:_count].to_f : 0.0
  end

  # return the number of categories in which a word appears
  def categories_with_word_count(word)
    return 0 unless @word_list[word] && @word_list[word][:categories]
    @word_list[word][:categories].length
  end

  # return the number of categories
  def total_categories
    categories.length
  end

  # return the list of categories
  def categories
    @category_list.keys
  end

  # train the classifier
  def train(category, text)
    @tokenizer.each_word(text) {|w| incr_word(w, category) }
    incr_cat(category)
  end

  # classify a text
  def classify(text, default=nil)
    # Find the category with the highest probability
    max_prob = @min_prob
    best = nil

    scores = cat_scores(text)
    scores.each do |score|
      cat, prob = score
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    # Return the default category in case the threshold condition was
    # not met. For example, if the threshold for :spam is 1.2:
    #
    #   :spam => 0.73, :ham => 0.40 (OK)
    #   :spam => 0.80, :ham => 0.70 (Fail, :ham is too close)

    return default unless best

    threshold = @thresholds[best] || 1.0

    scores.each do |score|
      cat, prob = score
      next if cat == best
      return default if prob * threshold > max_prob
    end

    return best
  end

  def save_state
    @storage.save_state(self)
  end

  class << self
    attr_writer :storage

    def storage
      @storage = StuffClassifier::InMemoryStorage.new unless defined? @storage
      @storage
    end

    def open(name)
      inst = self.new(name)
      if block_given?
        yield inst
        inst.save_state
      else
        inst
      end
    end
  end
end
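For readers skimming `classify` above: the winning category is returned only if every other category's score stays below `max_prob` even after being multiplied by the winner's threshold; otherwise the `default` argument comes back. A stand-alone sketch of that rule, using the same invented numbers as the comment in the code (these are not values produced by the gem):

```ruby
# Stand-alone illustration of the threshold rule in Base#classify.
scores    = { :spam => 0.80, :ham => 0.70 }   # cat_scores(text), best first
best      = :spam
max_prob  = scores[best]
threshold = 1.2                                # e.g. cls.thresholds[:spam] = 1.2

too_close = scores.any? { |cat, prob| cat != best && prob * threshold > max_prob }
result = too_close ? :unsure : best            # :unsure plays the role of `default`
puts result  #=> :unsure, because 0.70 * 1.2 = 0.84 > 0.80
```

With the first example from the comment (:spam => 0.73, :ham => 0.40), 0.40 * 1.2 = 0.48 stays below 0.73, so :spam would be returned.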
data/lib/stuff-classifier/bayes.rb
ADDED
@@ -0,0 +1,81 @@
# -*- encoding : utf-8 -*-

class StuffClassifier::Bayes < StuffClassifier::Base
  attr_accessor :weight
  attr_accessor :assumed_prob

  # http://en.wikipedia.org/wiki/Naive_Bayes_classifier
  extend StuffClassifier::Storage::ActAsStorable
  storable :weight, :assumed_prob

  def initialize(name, opts={})
    super(name, opts)
    @weight = opts[:weight] || 1.0
    @assumed_prob = opts[:assumed_prob] || 0.1
  end

  def word_prob(word, cat)
    total_words_in_cat = total_word_count_in_cat(cat)
    return 0.0 if total_words_in_cat == 0
    word_count(word, cat).to_f / total_words_in_cat
  end

  def word_weighted_average(word, cat, opts={})
    func = opts[:func]

    # calculate the current probability
    basic_prob = func ? func.call(word, cat) : word_prob(word, cat)

    # count the number of times this word has appeared in all
    # categories
    totals = total_word_count(word)

    # the final weighted average
    (@weight * @assumed_prob + totals * basic_prob) / (@weight + totals)
  end

  def doc_prob(text, category)
    @tokenizer.each_word(text).map {|w|
      word_weighted_average(w, category)
    }.inject(1) {|p,c| p * c}
  end

  def text_prob(text, category)
    cat_prob = cat_count(category) / total_cat_count
    doc_prob = doc_prob(text, category)
    cat_prob * doc_prob
  end

  def cat_scores(text)
    probs = {}
    categories.each do |cat|
      probs[cat] = text_prob(text, cat)
    end
    probs.map {|k,v| [k,v]}.sort {|a,b| b[1] <=> a[1]}
  end

  def word_classification_detail(word)
    p "word_prob"
    result = categories.inject({}) {|h,cat| h[cat] = word_prob(word, cat); h }
    p result

    p "word_weighted_average"
    result = categories.inject({}) {|h,cat| h[cat] = word_weighted_average(word, cat); h }
    p result

    p "doc_prob"
    result = categories.inject({}) {|h,cat| h[cat] = doc_prob(word, cat); h }
    p result

    p "text_prob"
    result = categories.inject({}) {|h,cat| h[cat] = text_prob(word, cat); h }
    p result
  end
end
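The `word_weighted_average` method above is a standard smoothing step: with the defaults `weight = 1.0` and `assumed_prob = 0.1`, an unseen word scores 0.1, and the score converges toward the observed per-category frequency as the word is seen more often. A quick stand-alone check of the arithmetic (the counts below are invented for illustration, not taken from the gem):

```ruby
# (weight * assumed_prob + totals * basic_prob) / (weight + totals)
weight, assumed_prob = 1.0, 0.1

smooth = lambda do |totals, basic_prob|
  (weight * assumed_prob + totals * basic_prob) / (weight + totals)
end

puts smooth.call(0, 0.0)   # unseen word   => 0.1  (the assumed probability)
puts smooth.call(3, 0.5)   # seen 3 times  => 0.4
puts smooth.call(30, 0.5)  # seen 30 times => ~0.487, converging on 0.5
```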