stuff-classifier 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +48 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +46 -0
- data/LICENSE.txt +20 -0
- data/README.md +103 -0
- data/Rakefile +30 -0
- data/lib/stuff-classifier.rb +8 -0
- data/lib/stuff-classifier/base.rb +66 -0
- data/lib/stuff-classifier/bayes.rb +55 -0
- data/lib/stuff-classifier/stop_words.rb +55 -0
- data/lib/stuff-classifier/tf-idf.rb +41 -0
- data/lib/stuff-classifier/tokenizer.rb +48 -0
- data/lib/stuff-classifier/version.rb +3 -0
- data/stuff-classifier.gemspec +29 -0
- data/test/helper.rb +42 -0
- data/test/test_001_tokenizer.rb +39 -0
- data/test/test_002_naive_bayes.rb +37 -0
- data/test/test_003_tf_idf.rb +37 -0
- metadata +152 -0
data/.gitignore
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# rcov generated
|
2
|
+
coverage
|
3
|
+
|
4
|
+
# rdoc generated
|
5
|
+
rdoc
|
6
|
+
|
7
|
+
# yard generated
|
8
|
+
doc
|
9
|
+
.yardoc
|
10
|
+
|
11
|
+
# bundler
|
12
|
+
.bundle
|
13
|
+
|
14
|
+
# jeweler generated
|
15
|
+
pkg
|
16
|
+
|
17
|
+
# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
|
18
|
+
#
|
19
|
+
# * Create a file at ~/.gitignore
|
20
|
+
# * Include files you want ignored
|
21
|
+
# * Run: git config --global core.excludesfile ~/.gitignore
|
22
|
+
#
|
23
|
+
# After doing this, these files will be ignored in all your git projects,
|
24
|
+
# saving you from having to 'pollute' every project you touch with them
|
25
|
+
#
|
26
|
+
# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line)
|
27
|
+
#
|
28
|
+
# For MacOS:
|
29
|
+
#
|
30
|
+
#.DS_Store
|
31
|
+
|
32
|
+
# For TextMate
|
33
|
+
#*.tmproj
|
34
|
+
#tmtags
|
35
|
+
|
36
|
+
# For emacs:
|
37
|
+
#*~
|
38
|
+
#\#*
|
39
|
+
#.\#*
|
40
|
+
|
41
|
+
# For vim:
|
42
|
+
#*.swp
|
43
|
+
|
44
|
+
# For redcar:
|
45
|
+
#.redcar
|
46
|
+
|
47
|
+
# For rubinius:
|
48
|
+
#*.rbc
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
stuff-classifier (0.1)
|
5
|
+
fast-stemmer (>= 1.0)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
ansi (1.4.1)
|
11
|
+
archive-tar-minitar (0.5.2)
|
12
|
+
columnize (0.3.4)
|
13
|
+
fast-stemmer (1.0.0)
|
14
|
+
json (1.6.5)
|
15
|
+
linecache19 (0.5.12)
|
16
|
+
ruby_core_source (>= 0.1.4)
|
17
|
+
minitest (2.10.1)
|
18
|
+
rake (0.9.2.2)
|
19
|
+
rcov (0.9.11)
|
20
|
+
rdoc (3.12)
|
21
|
+
json (~> 1.4)
|
22
|
+
ruby-debug-base19 (0.11.25)
|
23
|
+
columnize (>= 0.3.1)
|
24
|
+
linecache19 (>= 0.5.11)
|
25
|
+
ruby_core_source (>= 0.1.4)
|
26
|
+
ruby-debug19 (0.11.6)
|
27
|
+
columnize (>= 0.3.1)
|
28
|
+
linecache19 (>= 0.5.11)
|
29
|
+
ruby-debug-base19 (>= 0.11.19)
|
30
|
+
ruby_core_source (0.1.5)
|
31
|
+
archive-tar-minitar (>= 0.5.2)
|
32
|
+
turn (0.8.3)
|
33
|
+
ansi
|
34
|
+
|
35
|
+
PLATFORMS
|
36
|
+
ruby
|
37
|
+
|
38
|
+
DEPENDENCIES
|
39
|
+
bundler
|
40
|
+
minitest (>= 2.10)
|
41
|
+
rake (>= 0.9.2)
|
42
|
+
rcov (>= 0.9)
|
43
|
+
rdoc (>= 3.1)
|
44
|
+
ruby-debug19
|
45
|
+
stuff-classifier!
|
46
|
+
turn (>= 0.8.3)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Alexandru Nedelcu
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# stuff-classifier
|
2
|
+
|
3
|
+
A library for classifying text into multiple categories.
|
4
|
+
|
5
|
+
Currently provided classifiers:
|
6
|
+
|
7
|
+
- a [naive bayes classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
|
8
|
+
- a classifier based on [tf-idf weights](http://en.wikipedia.org/wiki/Tf%E2%80%93idf)
|
9
|
+
|
10
|
+
Ran a benchmark of 1345 items that I have previously manually
|
11
|
+
classified with multiple categories. Here's the rate at which the 2
|
12
|
+
algorithms have correctly detected one of those categories:
|
13
|
+
|
14
|
+
- Bayes: 79.26%
|
15
|
+
- Tf-Idf: 81.34%
|
16
|
+
|
17
|
+
I prefer the Naive Bayes approach, because while having lower stats on
|
18
|
+
this benchmark, it seems to make better decisions than I did in many
|
19
|
+
cases. For example, an item with title *"Paintball Session, 100 Balls
|
20
|
+
and Equipment"* was classified as *"Activities"* by me, but the bayes
|
21
|
+
classifier identified it as *"Sports"*, at which point I had an
|
22
|
+
intellectual orgasm. Also, the Tf-Idf classifier seems to do better on
|
23
|
+
clear-cut cases, but doesn't seem to handle uncertainty so well. Of
|
24
|
+
course, these are just quick tests I made and I have no idea which is
|
25
|
+
really better.
|
26
|
+
|
27
|
+
## Install
|
28
|
+
|
29
|
+
```bash
|
30
|
+
gem install stuff-classifier
|
31
|
+
```
|
32
|
+
|
33
|
+
## Usage
|
34
|
+
|
35
|
+
You either instantiate one class or the other. Both have the same
|
36
|
+
signature:
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'stuff-classifier'
|
40
|
+
|
41
|
+
# for the naive bayes implementation
|
42
|
+
cls = StuffClassifier::Bayes.new("Cats or Dogs")
|
43
|
+
|
44
|
+
# for the Tf-Idf based implementation
|
45
|
+
cls = StuffClassifier::TfIdf.new("Cats or Dogs")
|
46
|
+
```
|
47
|
+
|
48
|
+
Training the classifier:
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
cls.train(:dog, "Dogs are awesome, cats too. I love my dog")
|
52
|
+
cls.train(:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog")
|
53
|
+
cls.train(:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs")
|
54
|
+
cls.train(:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all")
|
55
|
+
cls.train(:dog, "So which one should you choose? A dog, definitely.")
|
56
|
+
cls.train(:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy")
|
57
|
+
cls.train(:dog, "A dog will eat anything, including birds or whatever meat")
|
58
|
+
cls.train(:cat, "My cat's favorite place to purr is on my keyboard")
|
59
|
+
cls.train(:dog, "My dog's favorite place to take a leak is the tree in front of our house")
|
60
|
+
```
|
61
|
+
|
62
|
+
And finally, classifying stuff:
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
cls.classify("This test is about cats.")
|
66
|
+
#=> :cat
|
67
|
+
cls.classify("I hate ...")
|
68
|
+
#=> :cat
|
69
|
+
cls.classify("The most annoying animal on earth.")
|
70
|
+
#=> :cat
|
71
|
+
cls.classify("The preferred company of software developers.")
|
72
|
+
#=> :cat
|
73
|
+
cls.classify("My precious, my favorite!")
|
74
|
+
#=> :cat
|
75
|
+
cls.classify("Kill that bird!")
|
76
|
+
#=> :cat
|
77
|
+
|
78
|
+
cls.classify("This test is about dogs.")
|
79
|
+
#=> :dog
|
80
|
+
cls.classify("Cats or Dogs?")
|
81
|
+
#=> :dog
|
82
|
+
cls.classify("What pet will I love more?")
|
83
|
+
#=> :dog
|
84
|
+
cls.classify("Willy, where the heck are you?")
|
85
|
+
#=> :dog
|
86
|
+
cls.classify("I like big buts and I cannot lie.")
|
87
|
+
#=> :dog
|
88
|
+
cls.classify("Why is the front door of our house open?")
|
89
|
+
#=> :dog
|
90
|
+
cls.classify("Who is eating my meat?")
|
91
|
+
#=> :dog
|
92
|
+
```
|
93
|
+
|
94
|
+
## TODO
|
95
|
+
|
96
|
+
- provide more implementations
|
97
|
+
- pluggable storage mechanism (in-memory, on disk, database)
|
98
|
+
|
99
|
+
## License
|
100
|
+
|
101
|
+
MIT Licensed. See LICENSE.txt for details.
|
102
|
+
|
103
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Rake tasks for the gem: unit tests (the default task), an rcov task
# and rdoc generation.
require 'bundler/setup'
require 'rake/testtask'
require 'stuff-classifier'

# Runs every test/**/test_*.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

# Same test files, run through rcov; installed gems are excluded.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |t|
  t.libs.push('test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
  t.rcov_opts.push('--exclude "gems/*"')
end

# Generates documentation for the README and everything under lib/
# into the rdoc/ directory; the title carries the gem version.
require 'rdoc/task'
RDoc::Task.new do |doc|
  gem_version = StuffClassifier::VERSION

  doc.rdoc_dir = 'rdoc'
  doc.title = "stuff-classifier #{gem_version}"
  doc.rdoc_files.include('README*')
  doc.rdoc_files.include('lib/**/*.rb')
end

task :default => :test
|
30
|
+
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# Top-level namespace of the stuff-classifier gem.
#
# Each constant is registered with +autoload+, so the file that defines
# it is only required the first time the constant is referenced.
module StuffClassifier
  {
    :VERSION    => 'stuff-classifier/version',
    :STOP_WORDS => 'stuff-classifier/stop_words',
    :Tokenizer  => 'stuff-classifier/tokenizer',
    :Base       => 'stuff-classifier/base',
    :Bayes      => 'stuff-classifier/bayes',
    :TfIdf      => 'stuff-classifier/tf-idf'
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Shared bookkeeping for the classifiers.
#
# Keeps two tallies that are filled in by #train:
#   @wcount - word => { category => number of times the word was seen }
#   @ccount - category => number of texts trained for that category
# and offers the basic probabilities derived from them.
class StuffClassifier::Base
  include StuffClassifier::Tokenizer

  # name - label stored on the instance.
  # opts - :stemming turns word stemming on or off (defaults to true).
  def initialize(name, opts={})
    @name = name
    @stemming = opts.fetch(:stemming, true)
    @wcount = {}
    @ccount = {}
    @ignore_words = nil
  end

  # Records one more occurrence of +word+ inside +category+ and returns
  # the updated tally.
  def incr_word(word, category)
    per_category = (@wcount[word] ||= {})
    per_category[category] = (per_category[category] || 0) + 1
  end

  # Records one more trained text for +category+ and returns the
  # updated tally.
  def incr_cat(category)
    @ccount[category] = (@ccount[category] || 0) + 1
  end

  # How often +word+ was seen in +category+, as a Float (0.0 if never).
  def word_count(word, category)
    per_category = @wcount[word]
    return 0.0 unless per_category && per_category[category]
    per_category[category].to_f
  end

  # How many texts were trained for +category+, as a Float (0.0 if none).
  def cat_count(category)
    (@ccount[category] || 0).to_f
  end

  # Number of trained texts over all categories, as a Float.
  def total_count
    sum = 0
    @ccount.each_value { |n| sum += n }
    sum.to_f
  end

  # Every category that has been trained so far.
  def categories
    @ccount.keys
  end

  # Learns +text+ as an example of +category+: each token produced by
  # each_word is tallied, then the category's text count is bumped.
  def train(category, text)
    each_word(text) do |token|
      incr_word(token, category)
    end
    incr_cat(category)
  end

  # Occurrences of +word+ in +cat+ divided by the number of texts
  # trained for +cat+; 0.0 for a category without training data.
  def word_prob(word, cat)
    trained = cat_count(cat)
    return 0.0 if trained == 0
    word_count(word, cat) / trained
  end

  # Blends the probability of +word+ in +cat+ with an assumed prior, so
  # that rarely seen words are pulled towards the prior.
  #
  # opts:
  #   :func         - callable (word, cat) used instead of #word_prob
  #   :weight       - weight given to the prior (default 1.0)
  #   :assumed_prob - the prior itself (default 0.5)
  def word_weighted_average(word, cat, opts={})
    prob_fn = opts[:func]
    weight = opts[:weight] || 1.0
    assumed_prob = opts[:assumed_prob] || 0.5

    # probability of the word as currently observed
    basic_prob =
      if prob_fn
        prob_fn.call(word, cat)
      else
        word_prob(word, cat)
      end

    # occurrences of the word summed over every category
    totals = categories.inject(0) { |sum, c| sum + word_count(word, c) }

    (weight * assumed_prob + totals * basic_prob) / (weight + totals)
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
|
2
|
+
# Naive Bayes text classifier.
# http://en.wikipedia.org/wiki/Naive_Bayes_classifier
class StuffClassifier::Bayes < StuffClassifier::Base

  # Takes the same arguments as StuffClassifier::Base#initialize.
  def initialize(name, opts={})
    super(name, opts)
    # category => factor by which the winner must beat every other
    # category; nothing in this class fills it in, so 1.0 is used.
    @thresholds = {}
  end

  # Product of the weighted probabilities of every word of +text+ for
  # +category+. A text without usable words yields the empty product, 1.
  def doc_prob(text, category)
    # each_word may hand back nil for blank text; treat that as "no words"
    # instead of failing with NoMethodError.
    words = each_word(text) || []
    words.inject(1) { |product, w| product * word_weighted_average(w, category) }
  end

  # Score of +category+ for +text+: the share of trained texts that
  # belong to the category multiplied by #doc_prob.
  # Returns 0.0 when nothing has been trained yet (avoids 0.0 / 0.0).
  def text_prob(text, category)
    total = total_count
    return 0.0 if total == 0

    cat_prob = cat_count(category) / total
    cat_prob * doc_prob(text, category)
  end

  # Returns [category, score] pairs for +text+, highest score first.
  def cat_scores(text)
    probs = {}
    categories.each do |cat|
      probs[cat] = text_prob(text, cat)
    end
    probs.map{|k,v| [k,v]}.sort{|a,b| b[1] <=> a[1]}
  end

  # Returns the best scoring category for +text+, or +default+ when no
  # category scores above zero or when another category comes too close
  # to the winner (its score times the winner's threshold exceeds the
  # winning score).
  def classify(text, default=nil)
    # Find the category with the highest probability
    max_prob = 0.0
    best = nil

    scores = cat_scores(text)
    scores.each do |cat, prob|
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    return default unless best
    threshold = @thresholds[best] || 1.0

    scores.each do |cat, prob|
      next if cat == best
      return default if prob * threshold > max_prob
    end

    best
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Set is part of the standard library but is not loaded by default on
# older Rubies, so require it explicitly before using it.
require 'set'

# Lower-case English words that the tokenizer drops unless a custom
# ignore list has been assigned. 'fifty' has been added; the original
# misspelling 'fify' is kept so that the set only grows.
StuffClassifier::STOP_WORDS = Set.new([
  'a', 'about', 'above', 'across', 'after', 'afterwards',
  'again', 'against', 'all', 'almost', 'alone', 'along',
  'already', 'also', 'although', 'always', 'am', 'among',
  'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
  'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
  'are', 'around', 'as', 'at', 'back', 'be',
  'became', 'because', 'become', 'becomes', 'becoming', 'been',
  'before', 'beforehand', 'behind', 'being', 'below', 'beside',
  'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
  'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
  'co', 'computer', 'con', 'could', 'couldnt', 'cry',
  'de', 'describe', 'detail', 'do', 'done', 'down',
  'due', 'during', 'each', 'eg', 'eight', 'either',
  'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
  'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
  'fifty', 'fify', 'fill', 'find', 'fire', 'first', 'five',
  'for', 'former', 'formerly', 'forty', 'found', 'four',
  'from', 'front', 'full', 'further', 'get', 'give',
  'go', 'had', 'has', 'hasnt', 'have', 'he',
  'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
  'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
  'how', 'however', 'hundred', 'i', 'ie', 'if',
  'in', 'inc', 'indeed', 'interest', 'into', 'is',
  'it', 'its', 'itself', 'keep', 'last', 'latter',
  'latterly', 'least', 'less', 'ltd', 'made', 'many',
  'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
  'more', 'moreover', 'most', 'mostly', 'move', 'much',
  'must', 'my', 'myself', 'name', 'namely', 'neither',
  'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
  'none', 'noone', 'nor', 'not', 'nothing', 'now',
  'nowhere', 'of', 'off', 'often', 'on', 'once',
  'one', 'only', 'onto', 'or', 'other', 'others',
  'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
  'own', 'part', 'per', 'perhaps', 'please', 'put',
  'rather', 're', 'same', 'see', 'seem', 'seemed',
  'seeming', 'seems', 'serious', 'several', 'she', 'should',
  'show', 'side', 'since', 'sincere', 'six', 'sixty',
  'so', 'some', 'somehow', 'someone', 'something', 'sometime',
  'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
  'ten', 'than', 'that', 'the', 'their', 'them',
  'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
  'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
  'thin', 'third', 'this', 'those', 'though', 'three',
  'through', 'throughout', 'thru', 'thus', 'to', 'together',
  'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
  'two', 'un', 'under', 'until', 'up', 'upon',
  'us', 'very', 'via', 'was', 'we', 'well',
  'were', 'what', 'whatever', 'when', 'whence', 'whenever',
  'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
  'wherever', 'whether', 'which', 'while', 'whither', 'who',
  'whoever', 'whole', 'whom', 'whose', 'why', 'will',
  'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
  'yourself', 'yourselves'
])
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Classifier that scores each category by summing tf-idf weights of the
# words of a text.
class StuffClassifier::TfIdf < StuffClassifier::Base

  # tf-idf weight of +word+ for +cat+:
  #   tf  = occurrences of the word in the category / texts trained for it
  #   idf = log10((number of categories + 2) / (categories containing the word + 1))
  # Returns 0.0 for a category without training data (avoids 0.0 / 0.0).
  def tf_idf(word, cat)
    word_cat_nr = word_count(word, cat)
    cat_nr = cat_count(cat)
    return 0.0 if cat_nr == 0

    tf = 1.0 * word_cat_nr / cat_nr

    total_categories = categories.length
    categories_with_word = (@wcount[word] || []).length

    idf = Math.log((total_categories + 2) / (categories_with_word + 1.0), 10)
    tf * idf
  end

  # Sum of the tf-idf weights of every word of +text+ for +cat+.
  # A text without usable words scores 0.
  def text_prob(text, cat)
    # each_word may hand back nil for blank text; treat that as "no words"
    # instead of failing with NoMethodError.
    words = each_word(text) || []
    words.map{|w| tf_idf(w, cat)}.inject(0){|s,p| s + p}
  end

  # Returns [category, score] pairs for +text+, highest score first.
  def cat_scores(text)
    probs = {}
    categories.each do |cat|
      probs[cat] = text_prob(text, cat)
    end
    probs.map{|k,v| [k,v]}.sort{|a,b| b[1] <=> a[1]}
  end

  # Returns the category with the highest positive score for +text+, or
  # +default+ when no category scores above zero.
  def classify(text, default=nil)
    max_prob = 0.0
    best = nil

    cat_scores(text).each do |cat, prob|
      if prob > max_prob
        max_prob = prob
        best = cat
      end
    end

    max_prob > 0 ? best : default
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'fast_stemmer'
|
2
|
+
|
3
|
+
# Mixin that turns free text into a list of normalized words.
#
# State lives in two instance variables of the including object:
#   @ignore_words - custom collection of words to drop (optional)
#   @stemming     - whether words are stemmed (off when never set)
module StuffClassifier::Tokenizer

  # Replaces the default stop-word list. The value must respond to
  # +member?+.
  def ignore_words=(value)
    @ignore_words = value
  end

  # Words that are dropped: the custom list when one was assigned,
  # otherwise StuffClassifier::STOP_WORDS.
  def ignore_words
    @ignore_words || StuffClassifier::STOP_WORDS
  end

  # True when stemming was switched on; false when it was never set.
  def stemming?
    defined?(@stemming) ? @stemming : false
  end

  def stemming=(value)
    @stemming = value
  end

  # Splits +string+ into lower-cased words.
  #
  # Apostrophes and backticks are removed, every other run of
  # non-letters acts as a separator, and ignored words are dropped. With
  # stemming on, each word is reduced with String#stem (presumably
  # supplied by the fast_stemmer require at the top of this file) and
  # the ignore list is checked again against the stem.
  #
  # Yields each kept word when a block is given and always returns the
  # kept words as an Array; blank input returns an empty Array, so
  # callers can chain Enumerable methods on the result.
  def each_word(string)
    string = string.strip
    return [] if string == ''

    words = []

    cnt = string.gsub(/['`]/, '')
    cnt.split("\n").each do |line|
      line_cnt = line.gsub(/[^a-zA-Z]+/, ' ')
      line_cnt.split(/\s+/).each do |w|
        # a leading separator leaves an empty first element behind
        next if w == '' || ignore_words.member?(w.downcase)

        if stemming?
          w = w.stem.downcase
          next if ignore_words.member?(w)
        else
          w = w.downcase
        end

        yield w if block_given?
        words << w
      end
    end

    words
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for stuff-classifier.
$:.push File.expand_path("../lib", __FILE__)
require "stuff-classifier/version"

Gem::Specification.new do |s|
  s.name        = "stuff-classifier"
  s.version     = StuffClassifier::VERSION
  s.authors     = ["Alexandru Nedelcu"]
  s.email       = ["me@alexn.org"]
  s.homepage    = "https://github.com/alexandru/stuff-classifier/"
  s.summary     = %q{Simple text classifier(s) implementation}
  s.description = %q{2 methods are provided for now - (1) naive bayes implementation + (2) tf-idf weights}
  # The README and LICENSE.txt state that the project is MIT licensed.
  s.licenses    = ["MIT"]

  # The packaged file lists are taken from git, so the gem has to be
  # built from a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency "fast-stemmer", ">= 1.0"

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake", ">= 0.9.2"
  s.add_development_dependency "minitest", ">= 2.10"
  s.add_development_dependency "turn", ">= 0.8.3"
  s.add_development_dependency "rdoc", ">= 3.1"
  s.add_development_dependency "rcov", ">= 0.9"
  s.add_development_dependency "ruby-debug19"
end
|
29
|
+
|
data/test/helper.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'stuff-classifier'
|
3
|
+
require 'turn'
|
4
|
+
|
5
|
+
# Settings for the Turn test reporter used when the test suite runs.
Turn.config do |c|
  # use one of output formats:
  # :outline  - turn's original case/test outline mode [default]
  # :progress - indicates progress with progress bar
  # :dotted   - test/unit's traditional dot-progress mode
  # :pretty   - new pretty reporter
  # :marshal  - dump output as YAML (normal run mode only)
  # :cue      - interactive testing
  c.format = :outline
  # turn on invoke/execute tracing, enable full backtrace
  c.trace = true
  # use humanized test names (works only with :outline format)
  c.natural = true
end
|
19
|
+
|
20
|
+
# Common base for the classifier tests: a class-level +before+ hook plus
# small helpers for training and checking a classifier.
class TestBase < MiniTest::Unit::TestCase
  class << self
    # With a block: stores it as the setup hook of this test class.
    # Always returns the hook stored on this class (nil if none).
    def before(&block)
      @on_setup = block if block
      @on_setup
    end
  end

  # Runs the hook registered with +before+ in the context of the test
  # instance, so instance variables set there are visible to the tests.
  def setup
    hook = self.class.before
    return unless hook
    instance_eval(&hook)
  end

  # Selects the classifier that +train+ and +should_be+ operate on.
  def set_classifier(instance)
    @classifier = instance
  end

  # Trains the current classifier with +value+ as an example of +category+.
  def train(category, value)
    @classifier.train(category, value)
  end

  # Asserts that the current classifier puts +value+ into +category+;
  # the text itself is used as the failure message.
  def should_be(category, value)
    actual = @classifier.classify(value)
    assert_equal category, actual, value
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises StuffClassifier::Tokenizer through a bare anonymous class
# that only includes the mixin.
class Test001Tokenizer < TestBase
  before do
    @tokenizer = Class.new { include StuffClassifier::Tokenizer }.new
  end

  # Stop words and punctuation disappear, the rest is lower-cased.
  def test_simple_tokens
    tokens = @tokenizer.each_word('Hello world! How are you?')
    assert_equal ["hello", "world"], tokens
  end

  # With stemming switched on the words come back as stems.
  def test_with_stemming
    @tokenizer.stemming = true
    expected = ["lot", "dog", "lot", "cat", "inform", "highwai"]
    tokens = @tokenizer.each_word('Lots of dogs, lots of cats! This is the information highway')
    assert_equal(expected, tokens)
  end

  # Multi-line input with digits, underscores and punctuation.
  def test_complicated_tokens
    tokens = @tokenizer.each_word("I don't really get what you want to
accomplish. There is a class TestEval2, you can do test_eval2 =
TestEval2.new afterwards. And: class A ... end always yields nil, so
your output is ok I guess ;-)")

    expected = [
      "really", "want", "accomplish", "class",
      "testeval", "test", "eval", "testeval", "new", "class", "end",
      "yields", "nil", "output", "ok", "guess"]

    assert_equal expected, tokens
  end

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
|
4
|
+
# Trains a naive Bayes classifier on a handful of cat/dog sentences and
# checks how unseen sentences are classified.
class Test002NaiveBayesClassification < TestBase
  before do
    set_classifier StuffClassifier::Bayes.new("Cats or Dogs")

    # [category, text] pairs, fed to the classifier in this order
    [
      [:dog, "Dogs are awesome, cats too. I love my dog"],
      [:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"],
      [:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"],
      [:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"],
      [:dog, "So which one should you choose? A dog, definitely."],
      [:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"],
      [:dog, "A dog will eat anything, including birds or whatever meat"],
      [:cat, "My cat's favorite place to purr is on my keyboard"],
      [:dog, "My dog's favorite place to take a leak is the tree in front of our house"]
    ].each do |category, text|
      train category, text
    end
  end

  def test_for_cats
    [
      "This test is about cats.",
      "I hate ...",
      "The most annoying animal on earth.",
      "The preferred company of software developers.",
      "My precious, my favorite!",
      "Kill that bird!"
    ].each do |text|
      should_be :cat, text
    end
  end

  def test_for_dogs
    [
      "This test is about dogs.",
      "Cats or Dogs?",
      "What pet will I love more?",
      "Willy, where the heck are you?",
      "I like big buts and I cannot lie.",
      "Why is the front door of our house open?",
      "Who ate my meat?"
    ].each do |text|
      should_be :dog, text
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
|
4
|
+
# Trains a tf-idf classifier on a handful of cat/dog sentences and
# checks how unseen sentences are classified.
class Test003TfIdfClassification < TestBase
  before do
    set_classifier StuffClassifier::TfIdf.new("Cats or Dogs")

    # [category, text] pairs, fed to the classifier in this order
    [
      [:dog, "Dogs are awesome, cats too. I love my dog"],
      [:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"],
      [:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"],
      [:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"],
      [:dog, "So which one should you choose? A dog, definitely."],
      [:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"],
      [:dog, "A dog will eat anything, including birds or whatever meat"],
      [:cat, "My cat's favorite place to purr is on my keyboard"],
      [:dog, "My dog's favorite place to take a leak is the tree in front of our house"]
    ].each do |category, text|
      train category, text
    end
  end

  def test_for_cats
    [
      "This test is about cats.",
      "I hate ...",
      "The most annoying animal on earth.",
      "The preferred company of software developers.",
      "My precious, my favorite!",
      "Kill that bird!"
    ].each do |text|
      should_be :cat, text
    end
  end

  def test_for_dogs
    [
      "This test is about dogs.",
      "Cats or Dogs?",
      "What pet will I love more?",
      "Willy, where the heck are you?",
      "I like big buts and I cannot lie.",
      "Why is the front door of our house open?",
      "Who is eating my meat?"
    ].each do |text|
      should_be :dog, text
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: stuff-classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alexandru Nedelcu
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-01-19 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: fast-stemmer
|
16
|
+
requirement: &71035640 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *71035640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bundler
|
27
|
+
requirement: &71096090 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *71096090
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rake
|
38
|
+
requirement: &71095820 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.9.2
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *71095820
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: minitest
|
49
|
+
requirement: &71095570 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.10'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *71095570
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: turn
|
60
|
+
requirement: &71095340 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 0.8.3
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *71095340
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rdoc
|
71
|
+
requirement: &71095110 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '3.1'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *71095110
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: rcov
|
82
|
+
requirement: &71094880 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0.9'
|
88
|
+
type: :development
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *71094880
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: ruby-debug19
|
93
|
+
requirement: &71094690 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
type: :development
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *71094690
|
102
|
+
description: 2 methods are provided for now - (1) naive bayes implementation + (2)
|
103
|
+
tf-idf weights
|
104
|
+
email:
|
105
|
+
- me@alexn.org
|
106
|
+
executables: []
|
107
|
+
extensions: []
|
108
|
+
extra_rdoc_files: []
|
109
|
+
files:
|
110
|
+
- .gitignore
|
111
|
+
- Gemfile
|
112
|
+
- Gemfile.lock
|
113
|
+
- LICENSE.txt
|
114
|
+
- README.md
|
115
|
+
- Rakefile
|
116
|
+
- lib/stuff-classifier.rb
|
117
|
+
- lib/stuff-classifier/base.rb
|
118
|
+
- lib/stuff-classifier/bayes.rb
|
119
|
+
- lib/stuff-classifier/stop_words.rb
|
120
|
+
- lib/stuff-classifier/tf-idf.rb
|
121
|
+
- lib/stuff-classifier/tokenizer.rb
|
122
|
+
- lib/stuff-classifier/version.rb
|
123
|
+
- stuff-classifier.gemspec
|
124
|
+
- test/helper.rb
|
125
|
+
- test/test_001_tokenizer.rb
|
126
|
+
- test/test_002_naive_bayes.rb
|
127
|
+
- test/test_003_tf_idf.rb
|
128
|
+
homepage: https://github.com/alexandru/stuff-classifier/
|
129
|
+
licenses: []
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
none: false
|
136
|
+
requirements:
|
137
|
+
- - ! '>='
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: '0'
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
|
+
none: false
|
142
|
+
requirements:
|
143
|
+
- - ! '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
requirements: []
|
147
|
+
rubyforge_project:
|
148
|
+
rubygems_version: 1.8.6
|
149
|
+
signing_key:
|
150
|
+
specification_version: 3
|
151
|
+
summary: Simple text classifier(s) implemetation
|
152
|
+
test_files: []
|