classified 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +27 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +5 -0
- data/README.markdown +35 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/classified.gemspec +72 -0
- data/lib/classified.rb +33 -0
- data/lib/classifiers/ankusa.rb +39 -0
- data/lib/classifiers/base.rb +12 -0
- data/lib/classifiers/classifier_bayes.rb +30 -0
- data/lib/classifiers/classifier_lsi.rb +26 -0
- data/lib/classifiers/hoatzin.rb +28 -0
- data/lib/corpus/utils.rb +80 -0
- data/lib/metrics.rb +76 -0
- data/lib/tasks/metrics.rake +37 -0
- data/test/corpora/movie_reviews/movie_reviews.zip +0 -0
- data/test/helper.rb +21 -0
- data/test/test_classified.rb +31 -0
- metadata +146 -0
data/Gemfile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
|
7
|
+
# Add dependencies to develop your gem here.
|
8
|
+
# Include everything needed to run rake, tests, features, etc.
|
9
|
+
group :development do
|
10
|
+
gem "shoulda", ">= 0"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.5.2"
|
13
|
+
gem "rcov", ">= 0"
|
14
|
+
end
|
15
|
+
|
16
|
+
group :test do
|
17
|
+
gem "libarchive"
|
18
|
+
gem "ankusa"
|
19
|
+
end
|
20
|
+
|
21
|
+
group :metrics do
|
22
|
+
gem "libarchive"
|
23
|
+
gem "ankusa"
|
24
|
+
gem "hoatzin", "0.1.0"
|
25
|
+
gem "classifier"
|
26
|
+
end
|
27
|
+
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
ankusa (0.0.8)
|
5
|
+
fast-stemmer (>= 1.0.0)
|
6
|
+
classifier (1.3.3)
|
7
|
+
fast-stemmer (>= 1.0.0)
|
8
|
+
fast-stemmer (1.0.0)
|
9
|
+
git (1.2.5)
|
10
|
+
hoatzin (0.1.0)
|
11
|
+
fast-stemmer
|
12
|
+
libsvm-ruby-swig
|
13
|
+
jeweler (1.5.2)
|
14
|
+
bundler (~> 1.0.0)
|
15
|
+
git (>= 1.2.5)
|
16
|
+
rake
|
17
|
+
libarchive (0.1.2)
|
18
|
+
libsvm-ruby-swig (0.4.0)
|
19
|
+
rake (0.8.7)
|
20
|
+
rcov (0.9.9)
|
21
|
+
shoulda (2.11.3)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
ankusa
|
28
|
+
bundler (~> 1.0.0)
|
29
|
+
classifier
|
30
|
+
hoatzin (= 0.1.0)
|
31
|
+
jeweler (~> 1.5.2)
|
32
|
+
libarchive
|
33
|
+
rcov
|
34
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,5 @@
|
|
1
|
+
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
2
|
+
|
3
|
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
4
|
+
|
5
|
+
You should have received a copy of the GNU General Public License along with this program. If not, see <www.gnu.org/licenses/>
|
data/README.markdown
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# classified - classifier abstraction and comparison framework
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
gem install classified
|
10
|
+
bundle install
|
11
|
+
|
12
|
+
NOTE: The libarchive library must be installed; it is used to decompress the corpora used in the comparisons.
|
13
|
+
|
14
|
+
## Usage:
|
15
|
+
|
16
|
+
bundle exec rake metrics:all
|
17
|
+
|
18
|
+
## Supported classifiers
|
19
|
+
|
20
|
+
* ankusa - https://github.com/livingsocial/ankusa
|
21
|
+
* hoatzin - https://github.com/rattle/hoatzin
|
22
|
+
* classifier (bayes + lsi) - https://github.com/cardmagic/classifier
|
23
|
+
|
24
|
+
## Analysis
|
25
|
+
|
26
|
+
For further information on interpreting the results of the comparison, see http://streamhacker.com/2010/05/17/text-classification-sentiment-analysis-precision-recall/
|
27
|
+
|
28
|
+
## Further reading and acknowledgements
|
29
|
+
|
30
|
+
See http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/ for the original inspiration.
|
31
|
+
|
32
|
+
## Copyright and License
|
33
|
+
|
34
|
+
GPL v3 - See LICENSE.txt for details.
|
35
|
+
Copyright (c) 2010, Rob Lee
|
data/Rakefile
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
Dir[File.dirname(__FILE__)+"/lib/tasks/*.rake"].sort.each { |ext| load ext }
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "classified"
|
18
|
+
gem.homepage = "http://github.com/rjlee/classified"
|
19
|
+
gem.license = "GPLv3"
|
20
|
+
gem.summary = %Q{classifier abstraction and comparison framework}
|
21
|
+
gem.description = %Q{classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.}
|
22
|
+
gem.email = "robl[at]rjlee.net"
|
23
|
+
gem.authors = ["Rob Lee"]
|
24
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
25
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
26
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
30
|
+
|
31
|
+
require 'rake/testtask'
|
32
|
+
Rake::TestTask.new(:test) do |test|
|
33
|
+
test.libs << 'lib' << 'test'
|
34
|
+
test.pattern = 'test/**/test_*.rb'
|
35
|
+
test.verbose = true
|
36
|
+
end
|
37
|
+
|
38
|
+
require 'rcov/rcovtask'
|
39
|
+
Rcov::RcovTask.new do |test|
|
40
|
+
test.libs << 'test'
|
41
|
+
test.pattern = 'test/**/test_*.rb'
|
42
|
+
test.verbose = true
|
43
|
+
end
|
44
|
+
|
45
|
+
task :default => :test
|
46
|
+
|
47
|
+
require 'rake/rdoctask'
|
48
|
+
Rake::RDocTask.new do |rdoc|
|
49
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
50
|
+
|
51
|
+
rdoc.rdoc_dir = 'rdoc'
|
52
|
+
rdoc.title = "classified #{version}"
|
53
|
+
rdoc.rdoc_files.include('README*')
|
54
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
55
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/classified.gemspec
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{classified}
|
8
|
+
s.version = "0.0.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Rob Lee"]
|
12
|
+
s.date = %q{2011-01-07}
|
13
|
+
s.description = %q{classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.}
|
14
|
+
s.email = %q{robl[at]rjlee.net}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.markdown"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"LICENSE.txt",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"classified.gemspec",
|
27
|
+
"lib/classified.rb",
|
28
|
+
"lib/classifiers/ankusa.rb",
|
29
|
+
"lib/classifiers/base.rb",
|
30
|
+
"lib/classifiers/classifier_bayes.rb",
|
31
|
+
"lib/classifiers/classifier_lsi.rb",
|
32
|
+
"lib/classifiers/hoatzin.rb",
|
33
|
+
"lib/corpus/utils.rb",
|
34
|
+
"lib/metrics.rb",
|
35
|
+
"lib/tasks/metrics.rake",
|
36
|
+
"test/corpora/movie_reviews/movie_reviews.zip",
|
37
|
+
"test/helper.rb",
|
38
|
+
"test/test_classified.rb"
|
39
|
+
]
|
40
|
+
s.homepage = %q{http://github.com/rjlee/classified}
|
41
|
+
s.licenses = ["GPLv3"]
|
42
|
+
s.require_paths = ["lib"]
|
43
|
+
s.rubygems_version = %q{1.3.7}
|
44
|
+
s.summary = %q{classifier abstraction and comparison framework}
|
45
|
+
s.test_files = [
|
46
|
+
"test/helper.rb",
|
47
|
+
"test/test_classified.rb"
|
48
|
+
]
|
49
|
+
|
50
|
+
if s.respond_to? :specification_version then
|
51
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
52
|
+
s.specification_version = 3
|
53
|
+
|
54
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
56
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
57
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
58
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
59
|
+
else
|
60
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
61
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
62
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
63
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
64
|
+
end
|
65
|
+
else
|
66
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
67
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
68
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
69
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
data/lib/classified.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'corpus/utils'
|
2
|
+
require 'metrics'
|
3
|
+
require 'classifiers/base'
|
4
|
+
require 'classifiers/ankusa'
|
5
|
+
require 'classifiers/hoatzin'
|
6
|
+
require 'classifiers/classifier_bayes'
|
7
|
+
require 'classifiers/classifier_lsi'
|
8
|
+
|
9
|
+
# Top-level namespace for the classifier abstraction layer.
module Classified
  # Factory that builds one of the wrapped classifier backends by name.
  class Classifiers

    # Raised by .create when the requested :classifier is not listed in
    # VALID_CLASSIFIERS. Inherits from StandardError rather than Exception so
    # that a plain `rescue` in calling code can handle it.
    class InvalidClassifier < StandardError; end

    # Backends that .create is willing to build. :classifier_lsi is left out,
    # so its branch in .create is unreachable until it is added here.
    VALID_CLASSIFIERS = [:classifier_bayes, :ankusa, :hoatzin] # :classifier_lsi

    # Builds a classifier wrapper.
    #
    # options - Hash of settings. :classifier selects the backend and
    #           defaults to :ankusa. The whole hash (including :classifier)
    #           is handed to the wrapper's constructor.
    #
    # Returns the new wrapper instance.
    # Raises InvalidClassifier when :classifier is not in VALID_CLASSIFIERS.
    def self.create options = {}
      # Non-destructive merge: the caller's hash is never modified here.
      options = { :classifier => :ankusa }.merge(options)

      unless VALID_CLASSIFIERS.include?(options[:classifier])
        raise InvalidClassifier,
              "unknown classifier #{options[:classifier].inspect}; " \
              "expected one of #{VALID_CLASSIFIERS.inspect}"
      end

      case options[:classifier]
      when :ankusa
        Classified::Ankusa.new(options)
      when :hoatzin
        Classified::Hoatzin.new(options)
      when :classifier_bayes
        Classified::ClassifierBayes.new(options)
      when :classifier_lsi
        Classified::ClassifierLSI.new(options)
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
if Gem.available?('ankusa')
|
2
|
+
require 'ankusa'
|
3
|
+
require 'ankusa/file_system_storage'
|
4
|
+
end
|
5
|
+
|
6
|
+
module Classified
  # Wrapper around ::Ankusa::NaiveBayesClassifier that exposes the common
  # train / classify / classifications interface.
  class Ankusa < Base
    attr_accessor :classifier

    # options - Hash of settings:
    #           :storage - :file selects ::Ankusa::FileSystemStorage, built
    #                      from options[:file]; any other value (including
    #                      :memory or nothing) selects ::Ankusa::MemoryStorage.
    #           :file    - argument passed to FileSystemStorage.new.
    #
    # The hash is only read, so the caller's copy is left untouched.
    def initialize options = {}
      @storage = if options[:storage] == :file
                   ::Ankusa::FileSystemStorage.new options[:file]
                 else
                   ::Ankusa::MemoryStorage.new
                 end
      @classifier = ::Ankusa::NaiveBayesClassifier.new @storage
    end

    # Passes one labelled document to the wrapped classifier and returns
    # whatever the backend's train returns.
    def train classification, text
      @classifier.train classification, text
    end

    # Returns the wrapped classifier's result for the given text.
    def classify text
      @classifier.classify text
    end

    # Returns the class names reported by the wrapped classifier.
    def classifications
      @classifier.classnames
    end

    # Delegates to the storage object's save.
    def save
      # TODO: confirm this is a support method for the classifier
      @storage.save
    end
  end
end
|
39
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'classifier' if Gem.available?('classifier')
|
2
|
+
|
3
|
+
module Classified
  # Wrapper around ::Classifier::Bayes that exposes the common
  # train / classify / classifications interface.
  class ClassifierBayes < Base
    attr_accessor :classifier, :options

    # options - Hash of settings:
    #           :classes - category names handed to ::Classifier::Bayes.new;
    #                      defaults to ['Positive', 'Negative'].
    #
    # The backend is built from the configured :classes, so the categories it
    # knows about always match what #classifications reports.
    def initialize options = {}
      @options = { :classes => ['Positive', 'Negative'] }.merge(options)
      @classifier = ::Classifier::Bayes.new(*@options[:classes])
    end

    # Trains through the backend's dynamic train_<category> methods.
    def train classification, text
      @classifier.send("train_#{classification.to_s}", text)
    end

    # Returns the backend's classification of the given text.
    def classify text
      @classifier.classify text
    end

    # Returns the configured category names.
    def classifications
      @options[:classes]
    end

    # Converts a corpus label into the capitalised string form used for the
    # default category names (e.g. :positive becomes "Positive").
    def transform text
      text.to_s.capitalize
    end

  end
end
|
30
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'classifier' if Gem.available?('classifier')
|
2
|
+
|
3
|
+
module Classified
  # Wrapper around ::Classifier::LSI that exposes the common
  # train / classify / classifications interface.
  class ClassifierLSI < Base
    attr_accessor :classifier, :options

    # options - Hash of settings:
    #           :classes - labels reported by #classifications; defaults to
    #                      [:positive, :negative].
    def initialize options = {}
      @options = { :classes => [:positive, :negative] }.merge(options)
      @classifier = ::Classifier::LSI.new
    end

    # Adds the text to the index under the given classification.
    #
    # The category is stored as passed in. It used to be stored as
    # "train_<classification>", so #classify could never return one of the
    # labels listed by #classifications.
    # NOTE(review): assumes Base#transform leaves labels unchanged - confirm.
    def train classification, text
      @classifier.add_item(text, classification)
    end

    # Returns the backend's classification of the given text.
    def classify text
      @classifier.classify text
    end

    # Returns the configured labels.
    def classifications
      @options[:classes]
    end

  end
end
|
26
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'hoatzin' if Gem.available?('hoatzin')
|
2
|
+
|
3
|
+
module Classified
  # Adapter that gives ::Hoatzin::Classifier the common
  # train / classify / classifications interface.
  class Hoatzin < Base
    attr_accessor :classifier

    # Builds the backend, forwarding the options hash unchanged.
    def initialize(options = {})
      @classifier = ::Hoatzin::Classifier.new(options)
    end

    # Forwards one labelled document to the backend.
    def train(classification, text)
      @classifier.train(classification, text)
    end

    # Forwards the text to the backend and returns its answer.
    def classify(text)
      @classifier.classify(text)
    end

    # Returns the classifications reported by the backend.
    def classifications
      @classifier.classifications
    end

    # Forwards the options hash to the backend's save.
    def save(options = {})
      # TODO: confirm this is a support method for the classifier
      @classifier.save(options)
    end
  end
end
|
28
|
+
|
data/lib/corpus/utils.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'fileutils'
require 'yaml'
require 'libarchive' if Gem.available?('libarchive')
|
2
|
+
|
3
|
+
module Classified
  module Corpus
    # Loaders that read a labelled corpus and split it into a training set
    # and a test set.
    class Utils

      # Loads the bundled movie review corpus.
      #
      # The archive test/corpora/movie_reviews/movie_reviews.zip is unpacked
      # next to itself the first time this is called. Documents are then
      # always read from the unpacked neg/ and pos/ directories, in sorted
      # filename order, so the first run and later runs return the same
      # corpus. (Previously the first run treated every archive entry as a
      # document, while later runs only read neg/*.txt and pos/*.txt.)
      #
      # Returns a Hash { :training => [...], :test => [...] } whose elements
      # are { :classification => :negative or :positive, :text => String }.
      def self.load_movie_reviews
        base_dir = File.join(File.dirname(__FILE__), '..', '..', 'test', 'corpora', 'movie_reviews')
        reviews_dir = File.join(base_dir, 'movie_reviews')

        # See if we've already unpacked them
        unless File.directory?(File.join(reviews_dir, 'neg'))
          unpack_archive(File.join(base_dir, 'movie_reviews.zip'), base_dir)
        end

        docs = []
        ['negative', 'positive'].each do |classification|
          # Directory names are the first three letters: "neg" and "pos".
          pattern = File.join(reviews_dir, classification[0, 3], '*.txt')
          Dir[pattern].sort.each do |filename|
            docs << { :classification => classification.to_sym, :text => File.read(filename) }
          end
        end

        split_corpus(docs)
      end

      # Loads a multi-document YAML file of statuses.
      #
      # file - path to a YAML stream; each document is read through
      #        status[:text] and status[:classification].
      #
      # NOTE(review): YAML.load_stream deserialises whatever the file
      # contains, so only point this at trusted files.
      #
      # Returns a Hash { :training => [...], :test => [...] } whose elements
      # are { :text => ..., :classification => ... }.
      def self.load_twitter_sentiment(file)
        docs = []
        File.open(file) do |yf|
          # load_stream replaces YAML.load_documents, which current Psych
          # no longer provides.
          YAML.load_stream(yf) do |status|
            docs << { :text => status[:text], :classification => status[:classification] }
          end
        end

        split_corpus(docs)
      end

      # Extracts every entry of the archive below target_dir.
      def self.unpack_archive(archive, target_dir)
        ::Archive.read_open_filename(archive) do |ar|
          while (entry = ar.next_header)
            path = File.join(target_dir, entry.pathname)
            if entry.pathname =~ /\/$/
              FileUtils.mkdir_p path
              next
            end
            # The archive may not list a directory before the files in it.
            FileUtils.mkdir_p File.dirname(path)
            File.open(path, 'wb') { |f| f.write(ar.read_data) }
          end
        end
      end
      private_class_method :unpack_archive

      # Splits docs per classification: the first three quarters of each
      # class (integer arithmetic) go to :training and the rest to :test.
      # Works for any set of labels, not only :positive and :negative.
      def self.split_corpus(docs)
        totals = Hash.new(0)
        docs.each { |doc| totals[doc[:classification]] += 1 }

        seen = Hash.new(0)
        corpus = { :training => [], :test => [] }
        docs.each do |doc|
          label = doc[:classification]
          seen[label] += 1
          bucket = seen[label] > totals[label] * 3 / 4 ? :test : :training
          corpus[bucket] << doc
        end
        corpus
      end
      private_class_method :split_corpus

    end
  end
end
|
data/lib/metrics.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
module Classified
  # Trains classifiers and scores them against a labelled test corpus.
  class Metrics

    # Feeds every document of the corpus to the classifier and prints the
    # elapsed time.
    #
    # classifier - object responding to train(classification, text) and
    #              classify(text).
    # corpus     - Array of { :classification => ..., :text => ... } hashes.
    def self.train_classifier classifier, corpus
      puts "Training"
      start_time = Time.now
      corpus.each do |doc|
        classifier.train doc[:classification], doc[:text]
      end
      # Call a classify action to force any syncing in the classifier to
      # occur, so that its cost is included in the training time.
      classifier.classify("It's rubbish")
      puts "Took #{Time.now-start_time} secs : training #{corpus.length} docs"
    end

    # Classifies every test document once and derives per-class metrics.
    #
    # classifier  - object responding to classifications, transform(label)
    #               and classify(text).
    # test_corpus - Array of { :classification => ..., :text => ... } hashes.
    #
    # For each class C:
    #   precision = TP / (TP + FP)  - share of documents predicted as C that
    #                                 really are C
    #   recall    = TP / (TP + FN)  - share of documents that really are C
    #                                 and were predicted as C
    #   fmeasure  = 1 / (alpha/precision + (1-alpha)/recall), alpha = 0.5
    # A ratio with a zero denominator is reported as 0.0.
    #
    # Returns a Hash with one { :precision, :recall, :fmeasure } entry per
    # class, plus :correct, :total, :accuracy and :duration.
    def self.test_classifier classifier, test_corpus
      alpha = 0.5

      puts "Getting metrics"
      start_time = Time.now

      true_pos  = Hash.new(0) # actual == predicted, keyed by that class
      false_neg = Hash.new(0) # keyed by the actual class that was missed
      false_pos = Hash.new(0) # keyed by the wrongly predicted class
      test_corpus.each do |doc|
        actual = classifier.transform(doc[:classification])
        predicted = classifier.classify(doc[:text])
        if predicted == actual
          true_pos[actual] += 1
        else
          false_neg[actual] += 1
          false_pos[predicted] += 1
        end
      end

      metrics = {}
      total = correct = 0
      classifier.classifications.each do |label|
        tp = true_pos[label]
        precision = ratio(tp, tp + false_pos[label])
        recall = ratio(tp, tp + false_neg[label])
        metrics[label] = {
          :precision => precision,
          :recall => recall,
          :fmeasure => fmeasure(precision, recall, alpha)
        }
        correct += tp
        total += tp + false_neg[label]
      end

      metrics[:correct] = correct
      metrics[:total] = total
      metrics[:accuracy] = ratio(correct, total)
      metrics[:duration] = Time.now-start_time

      puts "Took #{metrics[:duration]} secs"

      metrics
    end

    # Prints the metrics Hash returned by test_classifier.
    def self.summary(classifier, metrics)
      puts "average classification time = #{metrics[:duration]/metrics[:total].to_f} secs"
      puts "total classifications = #{metrics[:total]}"
      puts "correct classifications = #{metrics[:correct]}"
      puts "accuracy = #{metrics[:accuracy]}"
      classifier.classifications.each do |classification|
        puts "#{classification} classification metrics"
        metrics[classification].each_key do |metric|
          puts "\t#{metric} = #{metrics[classification][metric]}"
        end
        puts
      end
    end

    # Float division that yields 0.0 instead of NaN for a zero denominator.
    def self.ratio(numerator, denominator)
      denominator.zero? ? 0.0 : numerator.to_f / denominator
    end
    private_class_method :ratio

    # Weighted harmonic mean of precision and recall; 0.0 when either is 0.
    def self.fmeasure(precision, recall, alpha)
      return 0.0 if precision.zero? || recall.zero?
      1.0 / (alpha / precision + (1 - alpha) / recall)
    end
    private_class_method :fmeasure

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rake'
|
4
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..' ))
|
5
|
+
require 'classified'
|
6
|
+
include Classified
|
7
|
+
|
8
|
+
namespace :metrics do

  # Trains and scores every classifier in Classifiers::VALID_CLASSIFIERS
  # against one corpus.
  #
  #   rake metrics:evaluate                               (movie_reviews)
  #   rake metrics:evaluate[movie_reviews]
  #   rake metrics:evaluate[twitter_sentiment,path/to/statuses.yml]
  #
  # The :file argument is required for twitter_sentiment because
  # Corpus::Utils.load_twitter_sentiment takes the path of the file to read.
  task :evaluate, :corpus, :file do |_t, args|
    args.with_defaults(:corpus => 'movie_reviews')
    puts "Loading corpus #{args[:corpus]}"

    corpus = case args[:corpus]
             when 'movie_reviews'
               Corpus::Utils.load_movie_reviews
             when 'twitter_sentiment'
               unless args[:file]
                 raise ArgumentError, "twitter_sentiment needs a file: rake metrics:evaluate[twitter_sentiment,FILE]"
               end
               Corpus::Utils.load_twitter_sentiment(args[:file])
             else
               # Fail here rather than later with NoMethodError on nil.
               raise ArgumentError, "unknown corpus #{args[:corpus].inspect}; expected movie_reviews or twitter_sentiment"
             end

    puts
    Classifiers::VALID_CLASSIFIERS.each do |c|
      classifier = Classifiers.create(:classifier => c)
      puts "Analysing classifier: #{c}"
      Metrics.train_classifier classifier, corpus[:training]
      puts
      metrics = Metrics.test_classifier classifier, corpus[:test]
      puts
      Metrics.summary(classifier, metrics)
    end
  end

  # Runs the evaluation against the bundled movie review corpus.
  task :all do
    Rake::Task['metrics:evaluate'].invoke('movie_reviews')
  end

end
|
Binary file
|
data/test/helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'classified'
|
16
|
+
|
17
|
+
require 'libarchive'
|
18
|
+
|
19
|
+
class Test::Unit::TestCase
|
20
|
+
|
21
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
# Exercises the factory's default (ankusa) classifier, first on a single
# sentence and then after training on the movie review corpus.
class TestClassified < Test::Unit::TestCase

  context "With the default ankusa classifier" do

    setup do
      # No :classifier given, so the factory's default backend is used.
      @c = Classified::Classifiers.create({ :storage => :memory })
    end

    should "support training and classification" do
      trained = @c.train(:positive, "Thats nice")
      assert_equal({ :nice => 1 }, trained)
      assert_equal(:positive, @c.classify("Thats nice"))
    end

    context "with a trained moview review corpus" do

      setup do
        reviews = Classified::Corpus::Utils.load_movie_reviews
        Classified::Metrics.train_classifier(@c, reviews[:training])
      end

      should "correctly classify text" do
        assert_equal(:negative, @c.classify("It was rubbish"))
      end

    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: classified
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 31
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
version: 0.0.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Rob Lee
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-07 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
prerelease: false
|
23
|
+
name: shoulda
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
requirement: *id001
|
34
|
+
type: :development
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
37
|
+
name: bundler
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 23
|
44
|
+
segments:
|
45
|
+
- 1
|
46
|
+
- 0
|
47
|
+
- 0
|
48
|
+
version: 1.0.0
|
49
|
+
requirement: *id002
|
50
|
+
type: :development
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
prerelease: false
|
53
|
+
name: jeweler
|
54
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ~>
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 7
|
60
|
+
segments:
|
61
|
+
- 1
|
62
|
+
- 5
|
63
|
+
- 2
|
64
|
+
version: 1.5.2
|
65
|
+
requirement: *id003
|
66
|
+
type: :development
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
prerelease: false
|
69
|
+
name: rcov
|
70
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
requirement: *id004
|
80
|
+
type: :development
|
81
|
+
description: classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.
|
82
|
+
email: robl[at]rjlee.net
|
83
|
+
executables: []
|
84
|
+
|
85
|
+
extensions: []
|
86
|
+
|
87
|
+
extra_rdoc_files:
|
88
|
+
- LICENSE.txt
|
89
|
+
- README.markdown
|
90
|
+
files:
|
91
|
+
- Gemfile
|
92
|
+
- Gemfile.lock
|
93
|
+
- LICENSE.txt
|
94
|
+
- README.markdown
|
95
|
+
- Rakefile
|
96
|
+
- VERSION
|
97
|
+
- classified.gemspec
|
98
|
+
- lib/classified.rb
|
99
|
+
- lib/classifiers/ankusa.rb
|
100
|
+
- lib/classifiers/base.rb
|
101
|
+
- lib/classifiers/classifier_bayes.rb
|
102
|
+
- lib/classifiers/classifier_lsi.rb
|
103
|
+
- lib/classifiers/hoatzin.rb
|
104
|
+
- lib/corpus/utils.rb
|
105
|
+
- lib/metrics.rb
|
106
|
+
- lib/tasks/metrics.rake
|
107
|
+
- test/corpora/movie_reviews/movie_reviews.zip
|
108
|
+
- test/helper.rb
|
109
|
+
- test/test_classified.rb
|
110
|
+
has_rdoc: true
|
111
|
+
homepage: http://github.com/rjlee/classified
|
112
|
+
licenses:
|
113
|
+
- GPLv3
|
114
|
+
post_install_message:
|
115
|
+
rdoc_options: []
|
116
|
+
|
117
|
+
require_paths:
|
118
|
+
- lib
|
119
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
120
|
+
none: false
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
hash: 3
|
125
|
+
segments:
|
126
|
+
- 0
|
127
|
+
version: "0"
|
128
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
hash: 3
|
134
|
+
segments:
|
135
|
+
- 0
|
136
|
+
version: "0"
|
137
|
+
requirements: []
|
138
|
+
|
139
|
+
rubyforge_project:
|
140
|
+
rubygems_version: 1.3.7
|
141
|
+
signing_key:
|
142
|
+
specification_version: 3
|
143
|
+
summary: classifier abstraction and comparison framework
|
144
|
+
test_files:
|
145
|
+
- test/helper.rb
|
146
|
+
- test/test_classified.rb
|