classified 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +27 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +5 -0
- data/README.markdown +35 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/classified.gemspec +72 -0
- data/lib/classified.rb +33 -0
- data/lib/classifiers/ankusa.rb +39 -0
- data/lib/classifiers/base.rb +12 -0
- data/lib/classifiers/classifier_bayes.rb +30 -0
- data/lib/classifiers/classifier_lsi.rb +26 -0
- data/lib/classifiers/hoatzin.rb +28 -0
- data/lib/corpus/utils.rb +80 -0
- data/lib/metrics.rb +76 -0
- data/lib/tasks/metrics.rake +37 -0
- data/test/corpora/movie_reviews/movie_reviews.zip +0 -0
- data/test/helper.rb +21 -0
- data/test/test_classified.rb +31 -0
- metadata +146 -0
data/Gemfile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
|
7
|
+
# Add dependencies to develop your gem here.
|
8
|
+
# Include everything needed to run rake, tests, features, etc.
|
9
|
+
group :development do
|
10
|
+
gem "shoulda", ">= 0"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.5.2"
|
13
|
+
gem "rcov", ">= 0"
|
14
|
+
end
|
15
|
+
|
16
|
+
group :test do
|
17
|
+
gem "libarchive"
|
18
|
+
gem "ankusa"
|
19
|
+
end
|
20
|
+
|
21
|
+
group :metrics do
|
22
|
+
gem "libarchive"
|
23
|
+
gem "ankusa"
|
24
|
+
gem "hoatzin", "0.1.0"
|
25
|
+
gem "classifier"
|
26
|
+
end
|
27
|
+
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
ankusa (0.0.8)
|
5
|
+
fast-stemmer (>= 1.0.0)
|
6
|
+
classifier (1.3.3)
|
7
|
+
fast-stemmer (>= 1.0.0)
|
8
|
+
fast-stemmer (1.0.0)
|
9
|
+
git (1.2.5)
|
10
|
+
hoatzin (0.1.0)
|
11
|
+
fast-stemmer
|
12
|
+
libsvm-ruby-swig
|
13
|
+
jeweler (1.5.2)
|
14
|
+
bundler (~> 1.0.0)
|
15
|
+
git (>= 1.2.5)
|
16
|
+
rake
|
17
|
+
libarchive (0.1.2)
|
18
|
+
libsvm-ruby-swig (0.4.0)
|
19
|
+
rake (0.8.7)
|
20
|
+
rcov (0.9.9)
|
21
|
+
shoulda (2.11.3)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
ankusa
|
28
|
+
bundler (~> 1.0.0)
|
29
|
+
classifier
|
30
|
+
hoatzin (= 0.1.0)
|
31
|
+
jeweler (~> 1.5.2)
|
32
|
+
libarchive
|
33
|
+
rcov
|
34
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,5 @@
|
|
1
|
+
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
2
|
+
|
3
|
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
4
|
+
|
5
|
+
You should have received a copy of the GNU General Public License along with this program. If not, see <www.gnu.org/licenses/>
|
data/README.markdown
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# classified - classifier abstraction and comparison framework
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
classified provides an abstract interface to common ruby classifiers. It runs these classifiers against common corpora so that their accuracy, precision, recall and f-measure can be compared.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
gem install classified
|
10
|
+
bundle install
|
11
|
+
|
12
|
+
NOTE: The libarchive library must be installed; it is used to decompress the corpora used in the comparisons.
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
bundle exec rake metrics:all
|
17
|
+
|
18
|
+
## Supported classifiers
|
19
|
+
|
20
|
+
* ankusa - https://github.com/livingsocial/ankusa
|
21
|
+
* hoatzin - https://github.com/rattle/hoatzin
|
22
|
+
* classifier (bayes + lsi) - https://github.com/cardmagic/classifier
|
23
|
+
|
24
|
+
## Analysis
|
25
|
+
|
26
|
+
For further information on interpreting the results of the comparison, see http://streamhacker.com/2010/05/17/text-classification-sentiment-analysis-precision-recall/
|
27
|
+
|
28
|
+
## Further reading and acknowledgements
|
29
|
+
|
30
|
+
See http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/ for the original inspiration.
|
31
|
+
|
32
|
+
## Copyright and License
|
33
|
+
|
34
|
+
GPL v3 - See LICENSE.txt for details.
|
35
|
+
Copyright (c) 2010, Rob Lee
|
data/Rakefile
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
Dir[File.dirname(__FILE__)+"/lib/tasks/*.rake"].sort.each { |ext| load ext }
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "classified"
|
18
|
+
gem.homepage = "http://github.com/rjlee/classified"
|
19
|
+
gem.license = "GPLv3"
|
20
|
+
gem.summary = %Q{classifier abstraction and comparison framework}
|
21
|
+
gem.description = %Q{classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.}
|
22
|
+
gem.email = "robl[at]rjlee.net"
|
23
|
+
gem.authors = ["Rob Lee"]
|
24
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
25
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
26
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
27
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
28
|
+
end
|
29
|
+
Jeweler::RubygemsDotOrgTasks.new
|
30
|
+
|
31
|
+
require 'rake/testtask'
|
32
|
+
Rake::TestTask.new(:test) do |test|
|
33
|
+
test.libs << 'lib' << 'test'
|
34
|
+
test.pattern = 'test/**/test_*.rb'
|
35
|
+
test.verbose = true
|
36
|
+
end
|
37
|
+
|
38
|
+
require 'rcov/rcovtask'
|
39
|
+
Rcov::RcovTask.new do |test|
|
40
|
+
test.libs << 'test'
|
41
|
+
test.pattern = 'test/**/test_*.rb'
|
42
|
+
test.verbose = true
|
43
|
+
end
|
44
|
+
|
45
|
+
task :default => :test
|
46
|
+
|
47
|
+
require 'rake/rdoctask'
|
48
|
+
Rake::RDocTask.new do |rdoc|
|
49
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
50
|
+
|
51
|
+
rdoc.rdoc_dir = 'rdoc'
|
52
|
+
rdoc.title = "classified #{version}"
|
53
|
+
rdoc.rdoc_files.include('README*')
|
54
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
55
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/classified.gemspec
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{classified}
|
8
|
+
s.version = "0.0.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Rob Lee"]
|
12
|
+
s.date = %q{2011-01-07}
|
13
|
+
s.description = %q{classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.}
|
14
|
+
s.email = %q{robl[at]rjlee.net}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.markdown"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"LICENSE.txt",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"classified.gemspec",
|
27
|
+
"lib/classified.rb",
|
28
|
+
"lib/classifiers/ankusa.rb",
|
29
|
+
"lib/classifiers/base.rb",
|
30
|
+
"lib/classifiers/classifier_bayes.rb",
|
31
|
+
"lib/classifiers/classifier_lsi.rb",
|
32
|
+
"lib/classifiers/hoatzin.rb",
|
33
|
+
"lib/corpus/utils.rb",
|
34
|
+
"lib/metrics.rb",
|
35
|
+
"lib/tasks/metrics.rake",
|
36
|
+
"test/corpora/movie_reviews/movie_reviews.zip",
|
37
|
+
"test/helper.rb",
|
38
|
+
"test/test_classified.rb"
|
39
|
+
]
|
40
|
+
s.homepage = %q{http://github.com/rjlee/classified}
|
41
|
+
s.licenses = ["GPLv3"]
|
42
|
+
s.require_paths = ["lib"]
|
43
|
+
s.rubygems_version = %q{1.3.7}
|
44
|
+
s.summary = %q{classifier abstraction and comparison framework}
|
45
|
+
s.test_files = [
|
46
|
+
"test/helper.rb",
|
47
|
+
"test/test_classified.rb"
|
48
|
+
]
|
49
|
+
|
50
|
+
if s.respond_to? :specification_version then
|
51
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
52
|
+
s.specification_version = 3
|
53
|
+
|
54
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
55
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
56
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
57
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
58
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
59
|
+
else
|
60
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
61
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
62
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
63
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
64
|
+
end
|
65
|
+
else
|
66
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
67
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
68
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
69
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
data/lib/classified.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'corpus/utils'
|
2
|
+
require 'metrics'
|
3
|
+
require 'classifiers/base'
|
4
|
+
require 'classifiers/ankusa'
|
5
|
+
require 'classifiers/hoatzin'
|
6
|
+
require 'classifiers/classifier_bayes'
|
7
|
+
require 'classifiers/classifier_lsi'
|
8
|
+
|
9
|
+
module Classified
  # Factory that builds the classifier adapters bundled with this library.
  class Classifiers

    # Raised by +create+ when the requested classifier is not listed in
    # VALID_CLASSIFIERS. Derives from StandardError so that a plain
    # +rescue+ clause catches it.
    class InvalidClassifier < StandardError; end

    # Names accepted by +create+. :classifier_lsi has an adapter but is
    # deliberately not listed, so requesting it raises InvalidClassifier.
    VALID_CLASSIFIERS = [:classifier_bayes, :ankusa, :hoatzin] # :classifier_lsi

    # Builds the adapter selected by options[:classifier].
    #
    # options - Hash of settings. :classifier names the adapter (default
    #           :ankusa); the merged hash, including :classifier, is handed
    #           to the adapter's constructor. The caller's hash is not
    #           modified.
    #
    # Returns the new adapter instance.
    # Raises InvalidClassifier when :classifier is not in VALID_CLASSIFIERS.
    def self.create options = {}
      options = { :classifier => :ankusa }.merge(options)
      name = options[:classifier]

      unless VALID_CLASSIFIERS.include?(name)
        raise InvalidClassifier,
              "unknown classifier #{name.inspect} (valid: #{VALID_CLASSIFIERS.inspect})"
      end

      case name
      when :ankusa
        Classified::Ankusa.new(options)
      when :hoatzin
        Classified::Hoatzin.new(options)
      when :classifier_bayes
        Classified::ClassifierBayes.new(options)
      when :classifier_lsi
        # Only reachable if :classifier_lsi is added to VALID_CLASSIFIERS.
        Classified::ClassifierLSI.new(options)
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# ankusa is an optional dependency. Attempt the require and carry on if it
# cannot be loaded, instead of asking the deprecated Gem.available?.
begin
  require 'ankusa'
  require 'ankusa/file_system_storage'
rescue LoadError
  # ::Ankusa stays undefined; Classified::Ankusa.new will fail if it is used.
end

module Classified
  # Adapter that exposes ::Ankusa::NaiveBayesClassifier through the common
  # train / classify / classifications interface.
  class Ankusa < Base
    attr_accessor :classifier

    # options - Hash of settings:
    #           :storage - :file selects ::Ankusa::FileSystemStorage; any
    #                      other value (including :memory or none) selects
    #                      ::Ankusa::MemoryStorage.
    #           :file    - argument passed to FileSystemStorage.new when
    #                      :storage is :file.
    #
    # The options hash is only read; the caller's copy is left intact.
    def initialize options = {}
      @storage =
        if options[:storage] == :file
          ::Ankusa::FileSystemStorage.new options[:file]
        else
          ::Ankusa::MemoryStorage.new
        end
      @classifier = ::Ankusa::NaiveBayesClassifier.new @storage
    end

    # Delegates to the wrapped classifier's train and returns its result.
    def train classification, text
      @classifier.train classification, text
    end

    # Delegates to the wrapped classifier's classify and returns its result.
    def classify text
      @classifier.classify text
    end

    # Returns the wrapped classifier's classnames.
    def classifications
      @classifier.classnames
    end

    # Delegates to the storage object's save.
    def save
      # TODO: confirm this is a supported method for the classifier
      @storage.save
    end
  end
end
|
39
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# classifier is an optional dependency. Attempt the require and carry on if
# it cannot be loaded, instead of asking the deprecated Gem.available?.
begin
  require 'classifier'
rescue LoadError
  # ::Classifier stays undefined; ClassifierBayes.new will fail if it is used.
end

module Classified
  # Adapter that exposes ::Classifier::Bayes through the common
  # train / classify / classifications interface.
  class ClassifierBayes < Base
    attr_accessor :classifier, :options

    # options - Hash of settings. :classes lists the category names (default
    #           ['Positive', 'Negative']). The same list builds the Bayes
    #           classifier and is returned by +classifications+, so the two
    #           cannot disagree.
    def initialize options = {}
      @options = { :classes => ['Positive', 'Negative'] }.merge(options)
      @classifier = ::Classifier::Bayes.new(*@options[:classes])
    end

    # Trains by calling train_<classification> on the wrapped classifier,
    # e.g. :positive becomes train_positive.
    def train classification, text
      @classifier.send("train_#{classification}", text)
    end

    # Delegates to the wrapped classifier's classify and returns its result.
    def classify text
      @classifier.classify text
    end

    # Returns the category names configured in options[:classes].
    def classifications
      @options[:classes]
    end

    # Converts a corpus label to this adapter's category naming: the label
    # as a capitalized String (:positive => "Positive").
    def transform text
      text.to_s.capitalize
    end

  end
end
|
30
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# classifier is an optional dependency. Attempt the require and carry on if
# it cannot be loaded, instead of asking the deprecated Gem.available?.
begin
  require 'classifier'
rescue LoadError
  # ::Classifier stays undefined; ClassifierLSI.new will fail if it is used.
end

module Classified
  # Adapter that exposes ::Classifier::LSI through the common
  # train / classify / classifications interface.
  class ClassifierLSI < Base
    attr_accessor :classifier, :options

    # options - Hash of settings. :classes lists the category labels
    #           returned by +classifications+ (default [:positive, :negative]).
    def initialize options = {}
      @options = { :classes => [:positive, :negative] }.merge(options)
      @classifier = ::Classifier::LSI.new
    end

    # Adds text to the index under the given classification. The label is
    # stored as passed in (not as "train_<label>"), so the categories held
    # by the index are the ones listed by +classifications+.
    # NOTE(review): assumes LSI#classify returns the stored category
    # unchanged - confirm against the classifier gem.
    def train classification, text
      @classifier.add_item(text, classification)
    end

    # Delegates to the wrapped classifier's classify and returns its result.
    def classify text
      @classifier.classify text
    end

    # Returns the category labels configured in options[:classes].
    def classifications
      @options[:classes]
    end

  end
end
|
26
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# hoatzin is an optional dependency. Attempt the require and carry on if it
# cannot be loaded, instead of asking the deprecated Gem.available?.
begin
  require 'hoatzin'
rescue LoadError
  # ::Hoatzin stays undefined; Classified::Hoatzin.new will fail if it is used.
end

module Classified
  # Adapter that exposes ::Hoatzin::Classifier through the common
  # train / classify / classifications interface.
  class Hoatzin < Base
    attr_accessor :classifier

    # options - Hash passed unchanged to ::Hoatzin::Classifier.new.
    def initialize options = {}
      @classifier = ::Hoatzin::Classifier.new options
    end

    # Delegates to the wrapped classifier's train and returns its result.
    def train classification, text
      @classifier.train classification, text
    end

    # Delegates to the wrapped classifier's classify and returns its result.
    def classify text
      @classifier.classify text
    end

    # Returns the wrapped classifier's classifications.
    def classifications
      @classifier.classifications
    end

    # Delegates to the wrapped classifier's save, passing options through.
    def save options = {}
      # TODO: confirm this is a supported method for the classifier
      @classifier.save options
    end
  end
end
|
28
|
+
|
data/lib/corpus/utils.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'fileutils'
require 'yaml'

# libarchive is an optional dependency, needed only the first time the movie
# review archive has to be unpacked. Attempt the require and carry on if it
# cannot be loaded, instead of asking the deprecated Gem.available?.
begin
  require 'libarchive'
rescue LoadError
  # ::Archive stays undefined; unpacking the zip will fail if it is needed.
end

module Classified
  module Corpus
    # Loaders that read a labelled corpus and split it into training and
    # test sets.
    class Utils

      # Loads the bundled movie review corpus.
      #
      # Reads the already-unpacked test/corpora/movie_reviews/movie_reviews
      # directory when its neg/ subdirectory exists; otherwise extracts
      # movie_reviews.zip into test/corpora/movie_reviews while reading it.
      #
      # Returns { :training => [...], :test => [...] } where each entry is
      # { :classification => :negative | :positive, :text => String }.
      def self.load_movie_reviews
        base_dir = File.join(File.dirname(__FILE__), '..', '..', 'test', 'corpora', 'movie_reviews')
        unpacked_dir = File.join(base_dir, 'movie_reviews')

        docs =
          if File.directory?(File.join(unpacked_dir, 'neg'))
            read_unpacked_movie_reviews(unpacked_dir)
          else
            unpack_movie_reviews(base_dir)
          end

        split_corpus(docs)
      end

      # Loads a corpus from a YAML stream of documents, each a Hash with
      # :text and :classification keys.
      #
      # file - path of the YAML file. The file must be trusted: it is loaded
      #        without safe-loading restrictions.
      #
      # Returns { :training => [...], :test => [...] } where each entry is
      # { :text => ..., :classification => ... }.
      def self.load_twitter_sentiment(file)
        docs = []
        File.open(file) do |yf|
          each_yaml_document(yf) do |status|
            docs << { :text => status[:text], :classification => status[:classification] }
          end
        end
        split_corpus(docs)
      end

      # Reads the unpacked neg/ and pos/ directories, negative reviews first.
      def self.read_unpacked_movie_reviews(unpacked_dir)
        docs = []
        [:negative, :positive].each do |classification|
          # Directory names are the first three letters of the label.
          dir = File.join(unpacked_dir, classification.to_s[0, 3])
          # Sorted so the training/test split does not depend on the order
          # in which the file system lists the files.
          Dir[dir + "/*.txt"].sort.each do |filename|
            docs << { :classification => classification, :text => File.read(filename) }
          end
        end
        docs
      end

      # Reads every file in movie_reviews.zip, writing each one below
      # base_dir so later runs can use the unpacked copy.
      def self.unpack_movie_reviews(base_dir)
        docs = []
        ::Archive.read_open_filename(File.join(base_dir, 'movie_reviews.zip')) do |ar|
          while (entry = ar.next_header)
            name = entry.pathname
            target = File.join(base_dir, name)

            # Entries ending in "/" are directories.
            if name =~ /\/$/
              FileUtils.mkdir_p target
              next
            end

            classification = name =~ /neg/ ? :negative : :positive
            text = ar.read_data
            docs << { :classification => classification, :text => text }

            # The archive may list a file before its directory entry.
            FileUtils.mkdir_p File.dirname(target)
            # Binary mode so the bytes are written exactly as stored.
            File.open(target, 'wb') { |f| f.write(text) }
          end
        end
        docs
      end

      # Splits docs per classification: the first 3/4 (integer division) of
      # each label, in input order, go to :training and the rest to :test.
      def self.split_corpus(docs)
        per_label = Hash.new(0)
        docs.each { |doc| per_label[doc[:classification]] += 1 }

        seen = Hash.new(0)
        corpus = { :training => [], :test => [] }
        docs.each do |doc|
          label = doc[:classification]
          seen[label] += 1
          bucket = seen[label] > per_label[label] * 3 / 4 ? :test : :training
          corpus[bucket] << doc
        end
        corpus
      end

      # Yields each document of a multi-document YAML stream. Uses
      # YAML.load_documents where the YAML library still provides it and
      # YAML.load_stream otherwise.
      def self.each_yaml_document(io, &block)
        if YAML.respond_to?(:load_documents)
          YAML.load_documents(io, &block)
        else
          YAML.load_stream(io).each(&block)
        end
      end

      private_class_method :read_unpacked_movie_reviews, :unpack_movie_reviews,
                           :split_corpus, :each_yaml_document

    end
  end
end
|
data/lib/metrics.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
module Classified
  # Trains classifiers and scores them against a labelled test corpus.
  class Metrics

    # Trains classifier on every document in corpus and prints the time taken.
    #
    # classifier - object responding to train(classification, text) and
    #              classify(text).
    # corpus     - Array of { :classification => ..., :text => ... } hashes.
    def self.train_classifier classifier, corpus
      puts "Training"
      start_time = Time.now
      corpus.each do |doc|
        classifier.train doc[:classification], doc[:text]
      end
      # Call a classify action to force any syncing in the classifier to occur
      classifier.classify("It's rubbish")
      #classifier.save(:metadata => './metadata', :model => './model')
      puts "Took #{Time.now-start_time} secs : training #{corpus.length} docs"
    end

    # Classifies every document in test_corpus once and derives the metrics.
    #
    # classifier  - object responding to classify(text), classifications and
    #               transform(label).
    # test_corpus - Array of { :classification => ..., :text => ... } hashes.
    #
    # For each label in classifier.classifications:
    #   precision = correct predictions of the label / all predictions of it
    #   recall    = correct predictions of the label / all docs carrying it
    #   fmeasure  = 1 / (alpha/precision + (1 - alpha)/recall), alpha = 0.5
    # A ratio whose denominator is zero is reported as 0.0.
    #
    # Returns a Hash with one { :precision, :recall, :fmeasure } entry per
    # label plus :correct, :total, :accuracy and :duration.
    def self.test_classifier classifier, test_corpus
      alpha = 0.5

      puts "Getting metrics"
      start_time = Time.now

      hits = Hash.new(0)      # label => docs of that label classified correctly
      actual = Hash.new(0)    # label => docs carrying that label
      predicted = Hash.new(0) # label => docs the classifier assigned to it

      # One pass over the corpus: classify is the expensive call, so it runs
      # exactly once per document regardless of the number of classes.
      test_corpus.each do |doc|
        expected = classifier.transform(doc[:classification])
        guess = classifier.classify(doc[:text])
        actual[expected] += 1
        predicted[guess] += 1
        hits[expected] += 1 if guess == expected
      end

      metrics = {}
      correct = total = 0
      classifier.classifications.each do |label|
        precision = ratio(hits[label], predicted[label])
        recall = ratio(hits[label], actual[label])
        metrics[label] = {
          :precision => precision,
          :recall => recall,
          :fmeasure => f_measure(precision, recall, alpha)
        }
        correct += hits[label]
        total += actual[label]
      end

      metrics[:correct] = correct
      metrics[:total] = total
      metrics[:accuracy] = ratio(correct, total)
      metrics[:duration] = Time.now-start_time

      puts "Took #{metrics[:duration]} secs"

      metrics
    end

    # Prints the overall figures and the per-label metrics produced by
    # test_classifier.
    def self.summary(classifier, metrics)
      puts "average classification time = #{metrics[:duration]/metrics[:total].to_f} secs"
      puts "total classifications = #{metrics[:total]}"
      puts "correct classifications = #{metrics[:correct]}"
      puts "accuracy = #{metrics[:accuracy]}"
      classifier.classifications.each do |classification|
        puts "#{classification} classification metrics"
        metrics[classification].each do |metric, value|
          puts "\t#{metric} = #{value}"
        end
        puts
      end
    end

    # numerator / denominator as a Float, or 0.0 when denominator is zero.
    def self.ratio(numerator, denominator)
      denominator.zero? ? 0.0 : numerator.to_f / denominator
    end

    # Weighted harmonic mean of precision and recall; 0.0 when either is zero.
    def self.f_measure(precision, recall, alpha)
      return 0.0 if precision.zero? || recall.zero?
      1.0 / (alpha / precision + (1 - alpha) / recall)
    end

    private_class_method :ratio, :f_measure

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rake'
|
4
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..' ))
|
5
|
+
require 'classified'
|
6
|
+
include Classified
|
7
|
+
|
8
|
+
namespace :metrics do

  # Trains and scores every classifier in Classifiers::VALID_CLASSIFIERS
  # against one corpus.
  #
  #   rake metrics:evaluate[movie_reviews]
  #   rake metrics:evaluate[twitter_sentiment,path/to/statuses.yml]
  #
  # corpus - 'movie_reviews' (default) or 'twitter_sentiment'.
  # file   - YAML file to load; required for 'twitter_sentiment' because
  #          Corpus::Utils.load_twitter_sentiment takes the path as its argument.
  task :evaluate, :corpus, :file do |t, args|
    args.with_defaults(:corpus => 'movie_reviews')
    puts "Loading corpus #{args[:corpus]}"

    corpus =
      case args[:corpus]
      when 'movie_reviews'
        Corpus::Utils.load_movie_reviews
      when 'twitter_sentiment'
        unless args[:file]
          raise ArgumentError, "twitter_sentiment needs a file: rake metrics:evaluate[twitter_sentiment,FILE]"
        end
        Corpus::Utils.load_twitter_sentiment(args[:file])
      else
        # Fail here with a clear message instead of a nil error further down.
        raise ArgumentError, "unknown corpus #{args[:corpus].inspect} (expected movie_reviews or twitter_sentiment)"
      end

    puts
    Classifiers::VALID_CLASSIFIERS.each do |c|
      classifier = Classifiers.create(:classifier => c)
      puts "Analysing classifier: #{c}"
      Metrics.train_classifier classifier, corpus[:training]
      puts
      metrics = Metrics.test_classifier classifier, corpus[:test]
      puts
      Metrics.summary(classifier, metrics)
    end
  end

  # Runs the evaluation against the bundled movie review corpus.
  task :all do
    Rake::Task['metrics:evaluate'].invoke('movie_reviews')
  end

end
|
Binary file
|
data/test/helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'classified'
|
16
|
+
|
17
|
+
require 'libarchive'
|
18
|
+
|
19
|
+
class Test::Unit::TestCase
|
20
|
+
|
21
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
class TestClassified < Test::Unit::TestCase
|
5
|
+
|
6
|
+
context "With the default ankusa classifier" do
|
7
|
+
|
8
|
+
setup do
|
9
|
+
@c = Classified::Classifiers.create(:storage => :memory)
|
10
|
+
end
|
11
|
+
|
12
|
+
should "support training and classification" do
|
13
|
+
assert_equal Hash[:nice, 1], @c.train(:positive, "Thats nice")
|
14
|
+
assert_equal :positive, @c.classify("Thats nice")
|
15
|
+
end
|
16
|
+
|
17
|
+
context "with a trained moview review corpus" do
|
18
|
+
|
19
|
+
setup do
|
20
|
+
corpus = Classified::Corpus::Utils.load_movie_reviews
|
21
|
+
Classified::Metrics.train_classifier @c, corpus[:training]
|
22
|
+
end
|
23
|
+
|
24
|
+
should "correctly classify text" do
|
25
|
+
assert_equal :negative, @c.classify("It was rubbish")
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
metadata
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: classified
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 31
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
version: 0.0.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Rob Lee
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-07 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
prerelease: false
|
23
|
+
name: shoulda
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
requirement: *id001
|
34
|
+
type: :development
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
37
|
+
name: bundler
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 23
|
44
|
+
segments:
|
45
|
+
- 1
|
46
|
+
- 0
|
47
|
+
- 0
|
48
|
+
version: 1.0.0
|
49
|
+
requirement: *id002
|
50
|
+
type: :development
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
prerelease: false
|
53
|
+
name: jeweler
|
54
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ~>
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 7
|
60
|
+
segments:
|
61
|
+
- 1
|
62
|
+
- 5
|
63
|
+
- 2
|
64
|
+
version: 1.5.2
|
65
|
+
requirement: *id003
|
66
|
+
type: :development
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
prerelease: false
|
69
|
+
name: rcov
|
70
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
requirement: *id004
|
80
|
+
type: :development
|
81
|
+
description: classified provides an abstract interface to common ruby classifiers. It allows comparison of these classifiers using common corpora to compare accuracy, precision, recall and f-measure metrics.
|
82
|
+
email: robl[at]rjlee.net
|
83
|
+
executables: []
|
84
|
+
|
85
|
+
extensions: []
|
86
|
+
|
87
|
+
extra_rdoc_files:
|
88
|
+
- LICENSE.txt
|
89
|
+
- README.markdown
|
90
|
+
files:
|
91
|
+
- Gemfile
|
92
|
+
- Gemfile.lock
|
93
|
+
- LICENSE.txt
|
94
|
+
- README.markdown
|
95
|
+
- Rakefile
|
96
|
+
- VERSION
|
97
|
+
- classified.gemspec
|
98
|
+
- lib/classified.rb
|
99
|
+
- lib/classifiers/ankusa.rb
|
100
|
+
- lib/classifiers/base.rb
|
101
|
+
- lib/classifiers/classifier_bayes.rb
|
102
|
+
- lib/classifiers/classifier_lsi.rb
|
103
|
+
- lib/classifiers/hoatzin.rb
|
104
|
+
- lib/corpus/utils.rb
|
105
|
+
- lib/metrics.rb
|
106
|
+
- lib/tasks/metrics.rake
|
107
|
+
- test/corpora/movie_reviews/movie_reviews.zip
|
108
|
+
- test/helper.rb
|
109
|
+
- test/test_classified.rb
|
110
|
+
has_rdoc: true
|
111
|
+
homepage: http://github.com/rjlee/classified
|
112
|
+
licenses:
|
113
|
+
- GPLv3
|
114
|
+
post_install_message:
|
115
|
+
rdoc_options: []
|
116
|
+
|
117
|
+
require_paths:
|
118
|
+
- lib
|
119
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
120
|
+
none: false
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
hash: 3
|
125
|
+
segments:
|
126
|
+
- 0
|
127
|
+
version: "0"
|
128
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
hash: 3
|
134
|
+
segments:
|
135
|
+
- 0
|
136
|
+
version: "0"
|
137
|
+
requirements: []
|
138
|
+
|
139
|
+
rubyforge_project:
|
140
|
+
rubygems_version: 1.3.7
|
141
|
+
signing_key:
|
142
|
+
specification_version: 3
|
143
|
+
summary: classifier abstraction and comparison framework
|
144
|
+
test_files:
|
145
|
+
- test/helper.rb
|
146
|
+
- test/test_classified.rb
|