stuff-classifier 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +48 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +46 -0
- data/LICENSE.txt +20 -0
- data/README.md +103 -0
- data/Rakefile +30 -0
- data/lib/stuff-classifier.rb +8 -0
- data/lib/stuff-classifier/base.rb +66 -0
- data/lib/stuff-classifier/bayes.rb +55 -0
- data/lib/stuff-classifier/stop_words.rb +55 -0
- data/lib/stuff-classifier/tf-idf.rb +41 -0
- data/lib/stuff-classifier/tokenizer.rb +48 -0
- data/lib/stuff-classifier/version.rb +3 -0
- data/stuff-classifier.gemspec +29 -0
- data/test/helper.rb +42 -0
- data/test/test_001_tokenizer.rb +39 -0
- data/test/test_002_naive_bayes.rb +37 -0
- data/test/test_003_tf_idf.rb +37 -0
- metadata +152 -0
data/.gitignore
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# rcov generated
|
2
|
+
coverage
|
3
|
+
|
4
|
+
# rdoc generated
|
5
|
+
rdoc
|
6
|
+
|
7
|
+
# yard generated
|
8
|
+
doc
|
9
|
+
.yardoc
|
10
|
+
|
11
|
+
# bundler
|
12
|
+
.bundle
|
13
|
+
|
14
|
+
# jeweler generated
|
15
|
+
pkg
|
16
|
+
|
17
|
+
# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
|
18
|
+
#
|
19
|
+
# * Create a file at ~/.gitignore
|
20
|
+
# * Include files you want ignored
|
21
|
+
# * Run: git config --global core.excludesfile ~/.gitignore
|
22
|
+
#
|
23
|
+
# After doing this, these files will be ignored in all your git projects,
|
24
|
+
# saving you from having to 'pollute' every project you touch with them
|
25
|
+
#
|
26
|
+
# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
|
27
|
+
#
|
28
|
+
# For MacOS:
|
29
|
+
#
|
30
|
+
#.DS_Store
|
31
|
+
|
32
|
+
# For TextMate
|
33
|
+
#*.tmproj
|
34
|
+
#tmtags
|
35
|
+
|
36
|
+
# For emacs:
|
37
|
+
#*~
|
38
|
+
#\#*
|
39
|
+
#.\#*
|
40
|
+
|
41
|
+
# For vim:
|
42
|
+
#*.swp
|
43
|
+
|
44
|
+
# For redcar:
|
45
|
+
#.redcar
|
46
|
+
|
47
|
+
# For rubinius:
|
48
|
+
#*.rbc
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
stuff-classifier (0.1)
|
5
|
+
fast-stemmer (>= 1.0)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
ansi (1.4.1)
|
11
|
+
archive-tar-minitar (0.5.2)
|
12
|
+
columnize (0.3.4)
|
13
|
+
fast-stemmer (1.0.0)
|
14
|
+
json (1.6.5)
|
15
|
+
linecache19 (0.5.12)
|
16
|
+
ruby_core_source (>= 0.1.4)
|
17
|
+
minitest (2.10.1)
|
18
|
+
rake (0.9.2.2)
|
19
|
+
rcov (0.9.11)
|
20
|
+
rdoc (3.12)
|
21
|
+
json (~> 1.4)
|
22
|
+
ruby-debug-base19 (0.11.25)
|
23
|
+
columnize (>= 0.3.1)
|
24
|
+
linecache19 (>= 0.5.11)
|
25
|
+
ruby_core_source (>= 0.1.4)
|
26
|
+
ruby-debug19 (0.11.6)
|
27
|
+
columnize (>= 0.3.1)
|
28
|
+
linecache19 (>= 0.5.11)
|
29
|
+
ruby-debug-base19 (>= 0.11.19)
|
30
|
+
ruby_core_source (0.1.5)
|
31
|
+
archive-tar-minitar (>= 0.5.2)
|
32
|
+
turn (0.8.3)
|
33
|
+
ansi
|
34
|
+
|
35
|
+
PLATFORMS
|
36
|
+
ruby
|
37
|
+
|
38
|
+
DEPENDENCIES
|
39
|
+
bundler
|
40
|
+
minitest (>= 2.10)
|
41
|
+
rake (>= 0.9.2)
|
42
|
+
rcov (>= 0.9)
|
43
|
+
rdoc (>= 3.1)
|
44
|
+
ruby-debug19
|
45
|
+
stuff-classifier!
|
46
|
+
turn (>= 0.8.3)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Alexandru Nedelcu
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# stuff-classifier
|
2
|
+
|
3
|
+
A library for classifying text into multiple categories.
|
4
|
+
|
5
|
+
Currently provided classifiers:
|
6
|
+
|
7
|
+
- a [naive bayes classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
|
8
|
+
- a classifier based on [tf-idf weights](http://en.wikipedia.org/wiki/Tf%E2%80%93idf)
|
9
|
+
|
10
|
+
Ran a benchmark of 1345 items that I have previously manually
|
11
|
+
classified with multiple categories. Here's the rate at which the 2
|
12
|
+
algorithms have correctly detected one of those categories:
|
13
|
+
|
14
|
+
- Bayes: 79.26%
|
15
|
+
- Tf-Idf: 81.34%
|
16
|
+
|
17
|
+
I prefer the Naive Bayes approach, because while having lower stats on
|
18
|
+
this benchmark, it seems to make better decisions than I did in many
|
19
|
+
cases. For example, an item with title *"Paintball Session, 100 Balls
|
20
|
+
and Equipment"* was classified as *"Activities"* by me, but the bayes
|
21
|
+
classifier identified it as *"Sports"*, at which point I had an
|
22
|
+
intellectual orgasm. Also, the Tf-Idf classifier seems to do better on
|
23
|
+
clear-cut cases, but doesn't seem to handle uncertainty so well. Of
|
24
|
+
course, these are just quick tests I made and I have no idea which is
|
25
|
+
really better.
|
26
|
+
|
27
|
+
## Install
|
28
|
+
|
29
|
+
```bash
|
30
|
+
gem install stuff-classifier
|
31
|
+
```
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
You either instantiate one class or the other. Both have the same
|
36
|
+
signature:
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'stuff-classifier'
|
40
|
+
|
41
|
+
# for the naive bayes implementation
|
42
|
+
cls = StuffClassifier::Bayes.new("Cats or Dogs")
|
43
|
+
|
44
|
+
# for the Tf-Idf based implementation
|
45
|
+
cls = StuffClassifier::TfIdf.new("Cats or Dogs")
|
46
|
+
```
|
47
|
+
|
48
|
+
Training the classifier:
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
|
52
|
+
cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
|
53
|
+
cls.train(:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs")
|
54
|
+
cls.train(:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all")
|
55
|
+
cls.train(:dog, "So which one should you choose? A dog, definitely.")
|
56
|
+
cls.train(:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy")
|
57
|
+
cls.train(:dog, "A dog will eat anything, including birds or whatever meat")
|
58
|
+
cls.train(:cat, "My cat's favorite place to purr is on my keyboard")
|
59
|
+
cls.train(:dog, "My dog's favorite place to take a leak is the tree in front of our house")
|
60
|
+
```
|
61
|
+
|
62
|
+
And finally, classifying stuff:
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
cls.classify("This test is about cats.")
|
66
|
+
#=> :cat
|
67
|
+
cls.classify("I hate ...")
|
68
|
+
#=> :cat
|
69
|
+
cls.classify("The most annoying animal on earth.")
|
70
|
+
#=> :cat
|
71
|
+
cls.classify("The preferred company of software developers.")
|
72
|
+
#=> :cat
|
73
|
+
cls.classify("My precious, my favorite!")
|
74
|
+
#=> :cat
|
75
|
+
cls.classify("Kill that bird!")
|
76
|
+
#=> :cat
|
77
|
+
|
78
|
+
cls.classify("This test is about dogs.")
|
79
|
+
#=> :dog
|
80
|
+
cls.classify("Cats or Dogs?")
|
81
|
+
#=> :dog
|
82
|
+
cls.classify("What pet will I love more?")
|
83
|
+
#=> :dog
|
84
|
+
cls.classify("Willy, where the heck are you?")
|
85
|
+
#=> :dog
|
86
|
+
cls.classify("I like big buts and I cannot lie.")
|
87
|
+
#=> :dog
|
88
|
+
cls.classify("Why is the front door of our house open?")
|
89
|
+
#=> :dog
|
90
|
+
cls.classify("Who is eating my meat?")
|
91
|
+
#=> :dog
|
92
|
+
```
|
93
|
+
|
94
|
+
## TODO
|
95
|
+
|
96
|
+
- provide more implementations
|
97
|
+
- pluggable storage mechanism (in-memory, on disk, database)
|
98
|
+
|
99
|
+
## License
|
100
|
+
|
101
|
+
MIT Licensed. See LICENSE.txt for details.
|
102
|
+
|
103
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'stuff-classifier'
|
4
|
+
|
5
|
+
Rake::TestTask.new(:test) do |test|
|
6
|
+
test.libs << 'lib' << 'test'
|
7
|
+
test.pattern = 'test/**/test_*.rb'
|
8
|
+
test.verbose = true
|
9
|
+
end
|
10
|
+
|
11
|
+
require 'rcov/rcovtask'
|
12
|
+
Rcov::RcovTask.new do |test|
|
13
|
+
test.libs << 'test'
|
14
|
+
test.pattern = 'test/**/test_*.rb'
|
15
|
+
test.verbose = true
|
16
|
+
test.rcov_opts << '--exclude "gems/*"'
|
17
|
+
end
|
18
|
+
|
19
|
+
require 'rdoc/task'
|
20
|
+
RDoc::Task.new do |rdoc|
|
21
|
+
version = StuffClassifier::VERSION
|
22
|
+
|
23
|
+
rdoc.rdoc_dir = 'rdoc'
|
24
|
+
rdoc.title = "stuff-classifier #{version}"
|
25
|
+
rdoc.rdoc_files.include('README*')
|
26
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
27
|
+
end
|
28
|
+
|
29
|
+
task :default => :test
|
30
|
+
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# Top-level namespace of the gem. Every constant below is registered for
# lazy loading: its file is only required the first time the constant is
# referenced.
module StuffClassifier
  [
    [:VERSION,    'stuff-classifier/version'],
    [:STOP_WORDS, 'stuff-classifier/stop_words'],
    [:Tokenizer,  'stuff-classifier/tokenizer'],
    [:Base,       'stuff-classifier/base'],
    [:Bayes,      'stuff-classifier/bayes'],
    [:TfIdf,      'stuff-classifier/tf-idf']
  ].each do |const_name, path|
    autoload const_name, path
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Common bookkeeping shared by the classifiers.
#
# Keeps two in-memory tallies:
#   @wcount - word => { category => number of times the word was seen }
#   @ccount - category => number of training calls for that category
# Tokenization (each_word) comes from StuffClassifier::Tokenizer.
class StuffClassifier::Base
  include StuffClassifier::Tokenizer

  # name - label stored in @name.
  # opts - :stemming toggles stemming in the tokenizer (on unless given).
  def initialize(name, opts={})
    @name = name
    @stemming = opts.fetch(:stemming, true)
    @wcount = {}
    @ccount = {}
    @ignore_words = nil
  end

  # Bumps the tally of +word+ inside +category+; returns the new tally.
  def incr_word(word, category)
    per_category = (@wcount[word] ||= {})
    per_category[category] = (per_category[category] || 0) + 1
  end

  # Bumps the number of trainings recorded for +category+; returns it.
  def incr_cat(category)
    @ccount[category] = (@ccount[category] || 0) + 1
  end

  # How often +word+ was seen in +category+, as a Float (0.0 when never).
  def word_count(word, category)
    seen = @wcount[word] && @wcount[word][category]
    seen ? seen.to_f : 0.0
  end

  # How many times +category+ was trained, as a Float (0.0 when never).
  def cat_count(category)
    (@ccount[category] || 0).to_f
  end

  # Number of trainings over all categories, as a Float.
  def total_count
    @ccount.values.inject(0) { |sum, n| sum + n }.to_f
  end

  # Every category that has been trained at least once.
  def categories
    @ccount.keys
  end

  # Records each token of +text+ under +category+, then counts the training.
  def train(category, text)
    each_word(text) do |token|
      incr_word(token, category)
    end
    incr_cat(category)
  end

  # Occurrences of +word+ in +cat+ divided by the trainings of +cat+;
  # 0.0 for a category that was never trained.
  def word_prob(word, cat)
    trainings = cat_count(cat)
    return 0.0 if trainings == 0
    word_count(word, cat) / trainings
  end

  # Blends the observed probability of +word+ in +cat+ with a prior.
  #
  # opts[:func]         - callable (word, cat) used instead of word_prob
  # opts[:weight]       - weight of the prior (1.0 unless given)
  # opts[:assumed_prob] - the prior itself (0.5 unless given)
  def word_weighted_average(word, cat, opts={})
    prob_fn = opts[:func]
    weight = opts[:weight] || 1.0
    assumed_prob = opts[:assumed_prob] || 0.5

    # probability as currently observed
    basic_prob =
      if prob_fn
        prob_fn.call(word, cat)
      else
        word_prob(word, cat)
      end

    # occurrences of the word summed over every known category
    totals = categories.inject(0) { |sum, c| sum + word_count(word, c) }

    # the more often the word was seen, the less the prior matters
    blended = weight * assumed_prob + totals * basic_prob
    blended / (weight + totals)
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
|
2
|
+
# Naive Bayes text classifier built on the counters kept by
# StuffClassifier::Base.
class StuffClassifier::Bayes < StuffClassifier::Base
  # http://en.wikipedia.org/wiki/Naive_Bayes_classifier

  def initialize(name, opts={})
    super(name, opts)
    # category => factor by which the winner must beat every other
    # category; categories without an entry use 1.0 (see #classify)
    @thresholds = {}
  end

  # Product of the weighted word probabilities of every token of +text+
  # for +category+. Text without usable tokens yields 1.
  def doc_prob(text, category)
    # each_word returns nil for blank text; treat that like "no tokens"
    # instead of crashing on nil
    tokens = each_word(text) || []
    tokens.inject(1) { |product, w| product * word_weighted_average(w, category) }
  end

  # Score of +category+ for +text+: share of trainings that went to the
  # category, multiplied by doc_prob.
  def text_prob(text, category)
    cat_prob = cat_count(category) / total_count
    cat_prob * doc_prob(text, category)
  end

  # Returns [category, score] pairs for every known category, best first.
  def cat_scores(text)
    probs = {}
    categories.each do |cat|
      probs[cat] = text_prob(text, cat)
    end
    probs.map{|k,v| [k,v]}.sort{|a,b| b[1] <=> a[1]}
  end

  # Returns the best scoring category for +text+, or +default+ when no
  # category scores above zero or the winner does not beat every other
  # category by its threshold.
  def classify(text, default=nil)
    # Find the category with the highest probability
    max_prob = 0.0
    best = nil

    scores = cat_scores(text)
    scores.each do |cat, prob|
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    return default unless best
    threshold = @thresholds[best] || 1.0

    # the winner must exceed every other score multiplied by the threshold
    scores.each do |cat, prob|
      next if cat == best
      return default if prob * threshold > max_prob
    end

    best
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# This file is loaded on its own through autoload, so it has to pull in
# Set itself instead of relying on another library having required it.
require 'set'

# Lower-case words the tokenizer drops by default.
# 'fify' is a historical misspelling of 'fifty'; it is kept so existing
# behavior is unchanged, and the correctly spelled word is listed as well.
StuffClassifier::STOP_WORDS = Set.new [
  'a', 'about', 'above', 'across', 'after', 'afterwards',
  'again', 'against', 'all', 'almost', 'alone', 'along',
  'already', 'also', 'although', 'always', 'am', 'among',
  'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
  'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
  'are', 'around', 'as', 'at', 'back', 'be',
  'became', 'because', 'become', 'becomes', 'becoming', 'been',
  'before', 'beforehand', 'behind', 'being', 'below', 'beside',
  'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
  'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
  'co', 'computer', 'con', 'could', 'couldnt', 'cry',
  'de', 'describe', 'detail', 'do', 'done', 'down',
  'due', 'during', 'each', 'eg', 'eight', 'either',
  'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
  'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
  'fifty', 'fify', 'fill', 'find', 'fire', 'first', 'five',
  'for', 'former', 'formerly', 'forty', 'found', 'four',
  'from', 'front', 'full', 'further', 'get', 'give',
  'go', 'had', 'has', 'hasnt', 'have', 'he',
  'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
  'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
  'how', 'however', 'hundred', 'i', 'ie', 'if',
  'in', 'inc', 'indeed', 'interest', 'into', 'is',
  'it', 'its', 'itself', 'keep', 'last', 'latter',
  'latterly', 'least', 'less', 'ltd', 'made', 'many',
  'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
  'more', 'moreover', 'most', 'mostly', 'move', 'much',
  'must', 'my', 'myself', 'name', 'namely', 'neither',
  'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
  'none', 'noone', 'nor', 'not', 'nothing', 'now',
  'nowhere', 'of', 'off', 'often', 'on', 'once',
  'one', 'only', 'onto', 'or', 'other', 'others',
  'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
  'own', 'part', 'per', 'perhaps', 'please', 'put',
  'rather', 're', 'same', 'see', 'seem', 'seemed',
  'seeming', 'seems', 'serious', 'several', 'she', 'should',
  'show', 'side', 'since', 'sincere', 'six', 'sixty',
  'so', 'some', 'somehow', 'someone', 'something', 'sometime',
  'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
  'ten', 'than', 'that', 'the', 'their', 'them',
  'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
  'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
  'thin', 'third', 'this', 'those', 'though', 'three',
  'through', 'throughout', 'thru', 'thus', 'to', 'together',
  'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
  'two', 'un', 'under', 'until', 'up', 'upon',
  'us', 'very', 'via', 'was', 'we', 'well',
  'were', 'what', 'whatever', 'when', 'whence', 'whenever',
  'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
  'wherever', 'whether', 'which', 'while', 'whither', 'who',
  'whoever', 'whole', 'whom', 'whose', 'why', 'will',
  'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
  'yourself', 'yourselves'
]
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Classifier that scores categories with tf-idf weights computed from the
# counters kept by StuffClassifier::Base. Here a "document" is a category:
# idf is derived from the number of categories a word was seen in.
class StuffClassifier::TfIdf < StuffClassifier::Base

  # tf-idf weight of +word+ for +cat+.
  # Returns 0.0 for a category that was never trained.
  def tf_idf(word, cat)
    cat_nr = cat_count(cat)
    # without this guard an untrained category would give 0.0 / 0.0 = NaN
    return 0.0 if cat_nr == 0

    word_cat_nr = word_count(word, cat)
    tf = 1.0 * word_cat_nr / cat_nr

    total_categories = categories.length
    categories_with_word = (@wcount[word] || {}).length

    # the +2 / +1.0 offsets keep the ratio above 1 (so idf stays positive)
    # even when the word occurs in every category
    idf = Math.log((total_categories + 2) / (categories_with_word + 1.0), 10)
    tf * idf
  end

  # Sum of the tf-idf weights of every token of +text+ for +cat+.
  # Text without usable tokens scores 0.
  def text_prob(text, cat)
    # each_word returns nil for blank text; treat that like "no tokens"
    # instead of crashing on nil
    tokens = each_word(text) || []
    tokens.inject(0) { |sum, w| sum + tf_idf(w, cat) }
  end

  # Returns [category, score] pairs for every known category, best first.
  def cat_scores(text)
    probs = {}
    categories.each do |cat|
      probs[cat] = text_prob(text, cat)
    end
    probs.map{|k,v| [k,v]}.sort{|a,b| b[1] <=> a[1]}
  end

  # Returns the category with the highest positive score for +text+, or
  # +default+ when no category scores above zero.
  def classify(text, default=nil)
    max_prob = 0.0
    best = nil

    cat_scores(text).each do |cat, prob|
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    max_prob > 0 ? best : default
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'fast_stemmer'
|
2
|
+
|
3
|
+
# Mixin that turns free text into a list of normalized word tokens.
#
# State lives in two instance variables of the including object:
#   @ignore_words - collection of words to drop (queried with #member?)
#   @stemming     - whether tokens are stemmed
module StuffClassifier::Tokenizer

  # Replaces the collection of words to drop.
  def ignore_words=(value)
    @ignore_words = value
  end

  # Words to drop; StuffClassifier::STOP_WORDS unless a custom collection
  # was assigned.
  def ignore_words
    @ignore_words || StuffClassifier::STOP_WORDS
  end

  # Whether stemming is enabled; false when @stemming was never set.
  def stemming?
    defined?(@stemming) ? @stemming : false
  end

  # Enables or disables stemming.
  def stemming=(value)
    @stemming = value
  end

  # Splits +string+ into lower-cased tokens made of ASCII letters only,
  # skipping ignored words. With stemming enabled each token is stemmed
  # and checked against the ignored words once more.
  #
  # Yields every token when a block is given.
  # Always returns the Array of tokens, which is empty for blank input.
  def each_word(string)
    string = string.strip
    # keep the return type uniform: callers chain Array methods on the
    # result, so blank input must not produce nil
    return [] if string == ''

    words = []

    # drop apostrophes/backticks first so "don't" becomes "dont", then
    # take every maximal run of ASCII letters as a candidate token
    string.gsub(/['`]/, '').scan(/[a-zA-Z]+/).each do |w|
      next if ignore_words.member?(w.downcase)

      if stemming?
        # NOTE(review): String#stem is not core Ruby; presumably supplied
        # by the fast_stemmer require at the top of this file
        w = w.stem.downcase
        next if ignore_words.member?(w)
      else
        w = w.downcase
      end

      yield w if block_given?
      words << w
    end

    words
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "stuff-classifier/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "stuff-classifier"
|
7
|
+
s.version = StuffClassifier::VERSION
|
8
|
+
s.authors = ["Alexandru Nedelcu"]
|
9
|
+
s.email = ["me@alexn.org"]
|
10
|
+
s.homepage = "https://github.com/alexandru/stuff-classifier/"
|
11
|
+
s.summary = %q{Simple text classifier(s) implemetation}
|
12
|
+
s.description = %q{2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}
|
13
|
+
|
14
|
+
s.files = `git ls-files`.split("\n")
|
15
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
16
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
|
19
|
+
s.add_runtime_dependency "fast-stemmer", ">= 1.0"
|
20
|
+
|
21
|
+
s.add_development_dependency "bundler"
|
22
|
+
s.add_development_dependency "rake", ">= 0.9.2"
|
23
|
+
s.add_development_dependency "minitest", ">= 2.10"
|
24
|
+
s.add_development_dependency "turn", ">= 0.8.3"
|
25
|
+
s.add_development_dependency "rdoc", ">= 3.1"
|
26
|
+
s.add_development_dependency "rcov", ">= 0.9"
|
27
|
+
s.add_development_dependency "ruby-debug19"
|
28
|
+
end
|
29
|
+
|
data/test/helper.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'stuff-classifier'
|
3
|
+
require 'turn'
|
4
|
+
|
5
|
+
Turn.config do |c|
|
6
|
+
# use one of output formats:
|
7
|
+
# :outline - turn's original case/test outline mode [default]
|
8
|
+
# :progress - indicates progress with progress bar
|
9
|
+
# :dotted - test/unit's traditional dot-progress mode
|
10
|
+
# :pretty - new pretty reporter
|
11
|
+
# :marshal - dump output as YAML (normal run mode only)
|
12
|
+
# :cue - interactive testing
|
13
|
+
c.format = :outline
|
14
|
+
# turn on invoke/execute tracing, enable full backtrace
|
15
|
+
c.trace = true
|
16
|
+
# use humanized test names (works only with :outline format)
|
17
|
+
c.natural = true
|
18
|
+
end
|
19
|
+
|
20
|
+
class TestBase < MiniTest::Unit::TestCase
|
21
|
+
def self.before(&block)
|
22
|
+
@on_setup = block if block
|
23
|
+
@on_setup
|
24
|
+
end
|
25
|
+
|
26
|
+
def setup
|
27
|
+
on_setup = self.class.before
|
28
|
+
instance_eval(&on_setup) if on_setup
|
29
|
+
end
|
30
|
+
|
31
|
+
def set_classifier(instance)
|
32
|
+
@classifier = instance
|
33
|
+
end
|
34
|
+
|
35
|
+
def train(category, value)
|
36
|
+
@classifier.train(category, value)
|
37
|
+
end
|
38
|
+
|
39
|
+
def should_be(category, value)
|
40
|
+
assert_equal category, @classifier.classify(value), value
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
class Test001Tokenizer < TestBase
|
4
|
+
before do
|
5
|
+
tokenizer_cls = Class.new do
|
6
|
+
include StuffClassifier::Tokenizer
|
7
|
+
end
|
8
|
+
|
9
|
+
@tokenizer = tokenizer_cls.new
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_simple_tokens
|
13
|
+
assert_equal ["hello", "world"],
|
14
|
+
@tokenizer.each_word('Hello world! How are you?')
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_with_stemming
|
18
|
+
@tokenizer.stemming = true
|
19
|
+
assert_equal(
|
20
|
+
["lot", "dog", "lot", "cat", "inform", "highwai"],
|
21
|
+
@tokenizer.each_word('Lots of dogs, lots of cats! This is the information highway')
|
22
|
+
)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_complicated_tokens
|
26
|
+
words = @tokenizer.each_word("I don't really get what you want to
|
27
|
+
accomplish. There is a class TestEval2, you can do test_eval2 =
|
28
|
+
TestEval2.new afterwards. And: class A ... end always yields nil, so
|
29
|
+
your output is ok I guess ;-)")
|
30
|
+
|
31
|
+
should_return = [
|
32
|
+
"really", "want", "accomplish", "class",
|
33
|
+
"testeval", "test", "eval", "testeval", "new", "class", "end",
|
34
|
+
"yields", "nil", "output", "ok", "guess"]
|
35
|
+
|
36
|
+
assert_equal should_return, words
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
|
4
|
+
class Test002NaiveBayesClassification < TestBase
|
5
|
+
before do
|
6
|
+
set_classifier StuffClassifier::Bayes.new("Cats or Dogs")
|
7
|
+
|
8
|
+
train :dog, "Dogs are awesome, cats too. I love my dog"
|
9
|
+
train :cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"
|
10
|
+
train :dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"
|
11
|
+
train :cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"
|
12
|
+
train :dog, "So which one should you choose? A dog, definitely."
|
13
|
+
train :cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"
|
14
|
+
train :dog, "A dog will eat anything, including birds or whatever meat"
|
15
|
+
train :cat, "My cat's favorite place to purr is on my keyboard"
|
16
|
+
train :dog, "My dog's favorite place to take a leak is the tree in front of our house"
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_for_cats
|
20
|
+
should_be :cat, "This test is about cats."
|
21
|
+
should_be :cat, "I hate ..."
|
22
|
+
should_be :cat, "The most annoying animal on earth."
|
23
|
+
should_be :cat, "The preferred company of software developers."
|
24
|
+
should_be :cat, "My precious, my favorite!"
|
25
|
+
should_be :cat, "Kill that bird!"
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_for_dogs
|
29
|
+
should_be :dog, "This test is about dogs."
|
30
|
+
should_be :dog, "Cats or Dogs?"
|
31
|
+
should_be :dog, "What pet will I love more?"
|
32
|
+
should_be :dog, "Willy, where the heck are you?"
|
33
|
+
should_be :dog, "I like big buts and I cannot lie."
|
34
|
+
should_be :dog, "Why is the front door of our house open?"
|
35
|
+
should_be :dog, "Who ate my meat?"
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
|
4
|
+
class Test003TfIdfClassification < TestBase
|
5
|
+
before do
|
6
|
+
set_classifier StuffClassifier::TfIdf.new("Cats or Dogs")
|
7
|
+
|
8
|
+
train :dog, "Dogs are awesome, cats too. I love my dog"
|
9
|
+
train :cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"
|
10
|
+
train :dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"
|
11
|
+
train :cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"
|
12
|
+
train :dog, "So which one should you choose? A dog, definitely."
|
13
|
+
train :cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"
|
14
|
+
train :dog, "A dog will eat anything, including birds or whatever meat"
|
15
|
+
train :cat, "My cat's favorite place to purr is on my keyboard"
|
16
|
+
train :dog, "My dog's favorite place to take a leak is the tree in front of our house"
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_for_cats
|
20
|
+
should_be :cat, "This test is about cats."
|
21
|
+
should_be :cat, "I hate ..."
|
22
|
+
should_be :cat, "The most annoying animal on earth."
|
23
|
+
should_be :cat, "The preferred company of software developers."
|
24
|
+
should_be :cat, "My precious, my favorite!"
|
25
|
+
should_be :cat, "Kill that bird!"
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_for_dogs
|
29
|
+
should_be :dog, "This test is about dogs."
|
30
|
+
should_be :dog, "Cats or Dogs?"
|
31
|
+
should_be :dog, "What pet will I love more?"
|
32
|
+
should_be :dog, "Willy, where the heck are you?"
|
33
|
+
should_be :dog, "I like big buts and I cannot lie."
|
34
|
+
should_be :dog, "Why is the front door of our house open?"
|
35
|
+
should_be :dog, "Who is eating my meat?"
|
36
|
+
end
|
37
|
+
end
|
metadata
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stuff-classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alexandru Nedelcu
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-01-19 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: fast-stemmer
|
16
|
+
requirement: &71035640 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *71035640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bundler
|
27
|
+
requirement: &71096090 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *71096090
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rake
|
38
|
+
requirement: &71095820 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.9.2
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *71095820
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: minitest
|
49
|
+
requirement: &71095570 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.10'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *71095570
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: turn
|
60
|
+
requirement: &71095340 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 0.8.3
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *71095340
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rdoc
|
71
|
+
requirement: &71095110 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '3.1'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *71095110
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: rcov
|
82
|
+
requirement: &71094880 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0.9'
|
88
|
+
type: :development
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *71094880
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: ruby-debug19
|
93
|
+
requirement: &71094690 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
type: :development
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *71094690
|
102
|
+
description: 2 methods are provided for now - (1) naive bayes implementation + (2)
|
103
|
+
tf-idf weights
|
104
|
+
email:
|
105
|
+
- me@alexn.org
|
106
|
+
executables: []
|
107
|
+
extensions: []
|
108
|
+
extra_rdoc_files: []
|
109
|
+
files:
|
110
|
+
- .gitignore
|
111
|
+
- Gemfile
|
112
|
+
- Gemfile.lock
|
113
|
+
- LICENSE.txt
|
114
|
+
- README.md
|
115
|
+
- Rakefile
|
116
|
+
- lib/stuff-classifier.rb
|
117
|
+
- lib/stuff-classifier/base.rb
|
118
|
+
- lib/stuff-classifier/bayes.rb
|
119
|
+
- lib/stuff-classifier/stop_words.rb
|
120
|
+
- lib/stuff-classifier/tf-idf.rb
|
121
|
+
- lib/stuff-classifier/tokenizer.rb
|
122
|
+
- lib/stuff-classifier/version.rb
|
123
|
+
- stuff-classifier.gemspec
|
124
|
+
- test/helper.rb
|
125
|
+
- test/test_001_tokenizer.rb
|
126
|
+
- test/test_002_naive_bayes.rb
|
127
|
+
- test/test_003_tf_idf.rb
|
128
|
+
homepage: https://github.com/alexandru/stuff-classifier/
|
129
|
+
licenses: []
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
none: false
|
136
|
+
requirements:
|
137
|
+
- - ! '>='
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: '0'
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
|
+
none: false
|
142
|
+
requirements:
|
143
|
+
- - ! '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
requirements: []
|
147
|
+
rubyforge_project:
|
148
|
+
rubygems_version: 1.8.6
|
149
|
+
signing_key:
|
150
|
+
specification_version: 3
|
151
|
+
summary: Simple text classifier(s) implemetation
|
152
|
+
test_files: []
|