hoatzin 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +16 -0
- data/Gemfile.lock +24 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +48 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/hoatzin.gemspec +73 -0
- data/lib/hoatzin.rb +189 -0
- data/test/helper.rb +43 -0
- data/test/models/readonly-test/metadata +16 -0
- data/test/models/readonly-test/model +0 -0
- data/test/models/test/metadata +16 -0
- data/test/models/test/model +0 -0
- data/test/test_hoatzin.rb +74 -0
- metadata +161 -0
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
gem 'libsvm-ruby-swig'
|
7
|
+
gem 'fast-stemmer'
|
8
|
+
|
9
|
+
# Add dependencies to develop your gem here.
|
10
|
+
# Include everything needed to run rake, tests, features, etc.
|
11
|
+
group :development do
|
12
|
+
gem "shoulda", ">= 0"
|
13
|
+
gem "bundler", "~> 1.0.0"
|
14
|
+
gem "jeweler", "~> 1.5.2"
|
15
|
+
gem "rcov", ">= 0"
|
16
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
fast-stemmer (1.0.0)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.5.2)
|
7
|
+
bundler (~> 1.0.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
libsvm-ruby-swig (0.4.0)
|
11
|
+
rake (0.8.7)
|
12
|
+
rcov (0.9.9)
|
13
|
+
shoulda (2.11.3)
|
14
|
+
|
15
|
+
PLATFORMS
|
16
|
+
ruby
|
17
|
+
|
18
|
+
DEPENDENCIES
|
19
|
+
bundler (~> 1.0.0)
|
20
|
+
fast-stemmer
|
21
|
+
jeweler (~> 1.5.2)
|
22
|
+
libsvm-ruby-swig
|
23
|
+
rcov
|
24
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 robl
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# hoatzin
|
2
|
+
|
3
|
+
Hoatzin is a text classifier in Ruby that uses SVM for its classification.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
gem install hoatzin
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
# Create a hoatzin classifier
|
12
|
+
c = Hoatzin::Classifier.new()
|
13
|
+
|
14
|
+
# Train the classifier with a classification and some text
|
15
|
+
c.train(:positive, "Thats nice")
|
16
|
+
|
17
|
+
# This will return the most likely classification (:positive)
|
18
|
+
c.classify("Thats nice")
|
19
|
+
|
20
|
+
## Storage
|
21
|
+
|
22
|
+
The Hoatzin classifier supports saving your trained classifier to the filesystem. It needs
|
23
|
+
to store the libsvm model and the associated metadata as two separate files.
|
24
|
+
|
25
|
+
# Load a previously trained classifier
|
26
|
+
c = Hoatzin::Classifier.new(:metadata => '/path/to/file', :model => '/path/to/file')
|
27
|
+
|
28
|
+
# Save an existing trained classifier, without training data
|
29
|
+
# The #train method will raise an exception if called when the classifier is reloaded
|
30
|
+
c.save(:metadata => '/path/to/file', :model => '/path/to/file')
|
31
|
+
|
32
|
+
# Save an existing trained classifier, with training data
|
33
|
+
# The #train method can continue to be called when the classifier is reloaded
|
34
|
+
c.save(:metadata => '/path/to/file', :model => '/path/to/file', :update => true)
|
35
|
+
|
36
|
+
The classifier can continue to be trained if the model is saved with the :update => true option,
|
37
|
+
however the files stored on the filesystem will be much larger as they will contain copies
|
38
|
+
of all the documents used to train the classifier. It is generally advised to save without
|
39
|
+
the :update => true option unless it is definitely required.
|
40
|
+
|
41
|
+
## Acknowledgements
|
42
|
+
|
43
|
+
See http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/ for the original inspiration.
|
44
|
+
|
45
|
+
## Copyright and License
|
46
|
+
|
47
|
+
MIT - See LICENSE.txt for details.
|
48
|
+
Copyright (c) 2010, Rob Lee
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
require 'jeweler'
|
13
|
+
Jeweler::Tasks.new do |gem|
|
14
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
15
|
+
gem.name = "hoatzin"
|
16
|
+
gem.homepage = "http://github.com/rjlee/hoatzin"
|
17
|
+
gem.license = "MIT"
|
18
|
+
gem.summary = %Q{SVM Classifier in Ruby}
|
19
|
+
gem.description = %Q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
|
20
|
+
gem.email = "robl@rjlee.net"
|
21
|
+
gem.authors = ["robl"]
|
22
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
23
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
24
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
25
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
26
|
+
end
|
27
|
+
Jeweler::RubygemsDotOrgTasks.new
|
28
|
+
|
29
|
+
require 'rake/testtask'
|
30
|
+
Rake::TestTask.new(:test) do |test|
|
31
|
+
test.libs << 'lib' << 'test'
|
32
|
+
test.pattern = 'test/**/test_*.rb'
|
33
|
+
test.verbose = true
|
34
|
+
end
|
35
|
+
|
36
|
+
require 'rcov/rcovtask'
|
37
|
+
Rcov::RcovTask.new do |test|
|
38
|
+
test.libs << 'test'
|
39
|
+
test.pattern = 'test/**/test_*.rb'
|
40
|
+
test.verbose = true
|
41
|
+
end
|
42
|
+
|
43
|
+
task :default => :test
|
44
|
+
|
45
|
+
require 'rake/rdoctask'
|
46
|
+
Rake::RDocTask.new do |rdoc|
|
47
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
+
|
49
|
+
rdoc.rdoc_dir = 'rdoc'
|
50
|
+
rdoc.title = "hoatzin #{version}"
|
51
|
+
rdoc.rdoc_files.include('README*')
|
52
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/hoatzin.gemspec
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{hoatzin}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["robl"]
|
12
|
+
s.date = %q{2010-12-31}
|
13
|
+
s.description = %q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
|
14
|
+
s.email = %q{robl@rjlee.net}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.markdown"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"Gemfile",
|
21
|
+
"Gemfile.lock",
|
22
|
+
"LICENSE.txt",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"hoatzin.gemspec",
|
27
|
+
"lib/hoatzin.rb",
|
28
|
+
"test/helper.rb",
|
29
|
+
"test/models/readonly-test/metadata",
|
30
|
+
"test/models/readonly-test/model",
|
31
|
+
"test/models/test/metadata",
|
32
|
+
"test/models/test/model",
|
33
|
+
"test/test_hoatzin.rb"
|
34
|
+
]
|
35
|
+
s.homepage = %q{http://github.com/rjlee/hoatzin}
|
36
|
+
s.licenses = ["MIT"]
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
s.rubygems_version = %q{1.3.7}
|
39
|
+
s.summary = %q{SVM Classifier in Ruby}
|
40
|
+
s.test_files = [
|
41
|
+
"test/helper.rb",
|
42
|
+
"test/test_hoatzin.rb"
|
43
|
+
]
|
44
|
+
|
45
|
+
if s.respond_to? :specification_version then
|
46
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
47
|
+
s.specification_version = 3
|
48
|
+
|
49
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
50
|
+
s.add_runtime_dependency(%q<libsvm-ruby-swig>, [">= 0"])
|
51
|
+
s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
|
52
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
53
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
54
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
|
55
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<libsvm-ruby-swig>, [">= 0"])
|
58
|
+
s.add_dependency(%q<fast-stemmer>, [">= 0"])
|
59
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
60
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
61
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
62
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
63
|
+
end
|
64
|
+
else
|
65
|
+
s.add_dependency(%q<libsvm-ruby-swig>, [">= 0"])
|
66
|
+
s.add_dependency(%q<fast-stemmer>, [">= 0"])
|
67
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
68
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
69
|
+
s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
|
70
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
data/lib/hoatzin.rb
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
require 'svm'
|
2
|
+
require 'fast_stemmer'
|
3
|
+
require 'iconv'
|
4
|
+
|
5
|
+
module Hoatzin
|
6
|
+
class Classifier
|
7
|
+
|
8
|
+
# Raised by #train when the classifier is read-only, i.e. it was loaded
# from metadata that did not include the training documents.
# Inherits from StandardError so that a plain `rescue` can handle it.
class ReadOnly < StandardError; end
|
9
|
+
|
10
|
+
attr_reader :classifications
|
11
|
+
|
12
|
+
# Build a classifier, optionally restoring previously saved state.
#
# options - Hash of optional settings:
#           :metadata - path of the metadata file
#           :model    - path of the libsvm model file
#
# When both paths are given, the saved state is loaded straight away.
# The options hash is only read; the caller's hash is left untouched.
def initialize options = {}
  @metadata_file = options[:metadata]
  @model_file = options[:model]

  @documents = []        # token lists, one per trained document
  @dictionary = []       # every distinct token seen during training
  @classifications = []  # distinct labels, in order of first appearance
  @labels = []           # per-document index into @classifications
  @feature_vectors = []
  @problem = @model = nil
  @cache = 0             # document count when the model was last rebuilt
  @readonly = false

  # If we have model and metadata files then load them
  load if @metadata_file && @model_file

  # Define kernel parameters for libsvm
  @parameters = Parameter.new(:C => 100,
                              :degree => 1,
                              :coef0 => 0,
                              :eps => 0.001)
end
|
35
|
+
|
36
|
+
# Add one labelled document to the training set.
#
# classification - label for the document (compared with ==)
# text           - String content of the document
#
# Returns the recomputed feature vectors: one Array of 0/1 flags per
# trained document, with one flag per dictionary word.
# Raises ReadOnly when the classifier is read-only.
def train classification, text
  # Only allow retraining if we have all the required data
  raise ReadOnly if @readonly

  # Add the classification if we haven't seen it before
  @classifications << classification unless @classifications.include?(classification)

  tokens = Classifier.tokenize(text)

  # Append unseen tokens; existing words keep their position, so feature
  # indexes stay stable as the dictionary grows. Array union avoids
  # chaining bang methods, which may return nil.
  @dictionary |= tokens

  @documents << tokens
  @labels << @classifications.index(classification)

  # The dictionary may have grown, so every document's vector is rebuilt
  @feature_vectors = @documents.map do |doc|
    @dictionary.map { |word| doc.include?(word) ? 1 : 0 }
  end
end
|
59
|
+
|
60
|
+
# Predict the classification of a piece of text.
#
# text - String to classify
#
# Returns the entry of the classifications list selected by the model's
# prediction, or nil when there is no model yet (nothing trained and
# nothing loaded).
def classify text
  # Only update the model if we've trained more documents since it was last updated
  if !@readonly && @documents.length > @cache
    @cache = @documents.length
    assign_model
  end

  # Without this guard an untrained classifier would call
  # predict_probability on nil instead of returning nil.
  return nil if @model.nil?

  tokens = Classifier.tokenize(text)

  # One 0/1 flag per dictionary word, in dictionary order
  f_vector = @dictionary.map { |word| tokens.include?(word) ? 1 : 0 }

  pred, _probs = @model.predict_probability(f_vector)
  @classifications[pred.to_i]
end
|
78
|
+
|
79
|
+
# Persist the classifier as a metadata file plus a libsvm model file.
#
# options - Hash of optional settings:
#           :metadata - path for the metadata file (remembered for later saves)
#           :model    - path for the model file (remembered for later saves)
#           :update   - when truthy, also store documents, labels, feature
#                       vectors and the cache counter
#
# Returns false when either path is unknown, otherwise the result of the
# model's own #save.
def save options = {}
  @metadata_file = options[:metadata] if options.key?(:metadata)
  @model_file = options[:model] if options.key?(:model)
  return false unless @metadata_file && @model_file

  # Build the model before writing anything, and rebuild it when documents
  # were trained since it was last built, so the saved model matches the
  # saved dictionary.
  if @model.nil? || (!@readonly && @documents.length > @cache)
    assign_model
    @cache = @documents.length
  end

  data = { :dictionary => @dictionary, :classifications => @classifications }
  if options[:update]
    data.merge!(:documents => @documents,
                :labels => @labels,
                :feature_vectors => @feature_vectors,
                :cache => @cache)
  end

  # Marshal output is binary, so write it in binary mode
  File.open(@metadata_file, 'wb') { |f| Marshal.dump(data, f) }
  @model.save(@model_file)
end
|
92
|
+
|
93
|
+
protected
|
94
|
+
# Restore classifier state from @metadata_file and @model_file.
#
# The dictionary and classifications are always restored. Documents,
# labels, feature vectors and the cache counter are restored only when the
# metadata contains them. With no documents the classifier becomes
# read-only.
#
# Returns the model built from @model_file.
def load
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only load
  # metadata files from a trusted source.
  # Marshal data is binary, so read it in binary mode.
  data = File.open(@metadata_file, 'rb') { |f| Marshal.load(f) }

  @dictionary = data[:dictionary]
  @classifications = data[:classifications]

  if data.key?(:documents)
    @documents = data[:documents]
    @labels = data[:labels]
    @feature_vectors = data[:feature_vectors]
    @cache = data[:cache]
  end

  # Without the training documents the model cannot be rebuilt
  @readonly = @documents.empty?
  @model = Model.new(@model_file)
end
|
108
|
+
|
109
|
+
# Build a Problem from the current labels and feature vectors, then a Model
# from that problem and the kernel parameters; both are kept on the instance.
def assign_model
  problem = Problem.new(@labels, @feature_vectors)
  @problem = problem
  @model = Model.new(problem, @parameters)
end
|
113
|
+
|
114
|
+
# Adapted from ankusa, to replace with tokenizer gem
|
115
|
+
# Split text into the words used as classifier features.
#
# Hyphens and every character that is neither a word character nor
# whitespace act as separators. Tokens of three characters or fewer and
# tokens listed in Classifier.stop_words are dropped. Case is preserved and
# the stop-word comparison is case-sensitive.
#
# text - String to tokenize
#
# Returns an Array of String tokens in order of appearance (duplicates kept).
def self.tokenize text
  # NOTE(review): this method used to build an Iconv ASCII transliteration
  # of the text but discarded the result, so it never influenced the tokens.
  # That dead computation has been removed. If transliteration is wanted,
  # convert the text and assign it before splitting.

  # Fetch the stop-word list once rather than once per token
  stop_list = Classifier.stop_words

  text.tr('-', ' ').gsub(/[^\w\s]/, " ").split.select do |token|
    token.length > 3 && !stop_list.include?(token)
  end
end
|
125
|
+
|
126
|
+
# ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
127
|
+
# English stop words that the tokenizer drops.
# The duplicate "would" entry has been removed, so each word appears once.
#
# Returns a new Array of lowercase Strings on every call.
def self.stop_words
  %w{
    a a's able about above according accordingly across actually after
    afterwards again against ain't all allow allows almost alone along
    already also although always am among amongst an and another any
    anybody anyhow anyone anything anyway anyways anywhere apart appear
    appreciate appropriate are aren't around as aside ask asking
    associated at available away awfully b be became because become
    becomes becoming been before beforehand behind being believe below
    beside besides best better between beyond both brief but by c
    c'mon c's came can can't cannot cant cause causes certain certainly
    changes clearly co com come comes concerning consequently consider
    considering contain containing contains corresponding could couldn't
    course currently d definitely described despite did didn't different
    do does doesn't doing don't done down downwards during e each edu
    eg eight either else elsewhere enough entirely especially et etc
    even ever every everybody everyone everything everywhere ex exactly
    example except f far few fifth first five followed following follows
    for former formerly forth four from further furthermore g get gets
    getting given gives go goes going gone got gotten greetings h had
    hadn't happens hardly has hasn't have haven't having he he's hello
    help hence her here here's hereafter hereby herein hereupon hers
    herself hi him himself his hither hopefully how howbeit however i
    i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
    indicate indicated indicates inner insofar instead into inward is
    isn't it it'd it'll it's its itself j just k keep keeps kept know
    knows known l last lately later latter latterly least less lest let
    let's like liked likely little look looking looks ltd m mainly many
    may maybe me mean meanwhile merely might more moreover most mostly
    much must my myself n name namely nd near nearly necessary need needs
    neither never nevertheless new next nine no nobody non none noone
    nor normally not nothing novel now nowhere o obviously of off often
    oh ok okay old on once one ones only onto or other others otherwise
    ought our ours ourselves out outside over overall own p particular
    particularly per perhaps placed please plus possible presumably
    probably provides q que quite qv r rather rd re really reasonably
    regarding regardless regards relatively respectively right s said
    same saw say saying says second secondly see seeing seem seemed
    seeming seems seen self selves sensible sent serious seriously
    seven several shall she should shouldn't since six so some somebody
    somehow someone something sometime sometimes somewhat somewhere soon
    sorry specified specify specifying still sub such sup sure t t's
    take taken tell tends th than thank thanks thanx that that's thats
    the their theirs them themselves then thence there there's thereafter
    thereby therefore therein theres thereupon these they they'd they'll
    they're they've think third this thorough thoroughly those though
    three through throughout thru thus to together too took toward
    towards tried tries truly try trying twice two u un under
    unfortunately unless unlikely until unto up upon us use used useful
    uses using usually uucp v value various very via viz vs w want wants
    was wasn't way we we'd we'll we're we've welcome well went were weren't
    what what's whatever when whence whenever where where's whereafter
    whereas whereby wherein whereupon wherever whether which while
    whither who who's whoever whole whom whose why will willing wish
    with within without won't wonder would wouldn't x y yes yet
    you you'd you'll you're you've your yours yourself yourselves
    z zero
  }
end
|
186
|
+
|
187
|
+
|
188
|
+
end
|
189
|
+
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'hoatzin'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
|
19
|
+
TRAINING_LABELS = [1, 1, 0, 1, 1, 0, 0]
|
20
|
+
TRAINING_DOCS = [
|
21
|
+
'FREE NATIONAL TREASURE',
|
22
|
+
'FREE TV for EVERY visitor',
|
23
|
+
'Peter and Stewie are hilarious',
|
24
|
+
'AS SEEN ON NATIONAL TV',
|
25
|
+
'FREE drugs',
|
26
|
+
'New episode rocks, Peter and Stewie are hilarious',
|
27
|
+
'Peter is my fav!'
|
28
|
+
# ...
|
29
|
+
]
|
30
|
+
|
31
|
+
TESTING_LABELS = [1, 0, 0]
|
32
|
+
TESTING_DOCS = [
|
33
|
+
'FREE lotterry for the NATIONAL TREASURE !!!',
|
34
|
+
'Stewie is hilarious',
|
35
|
+
'Poor Peter ... hilarious',
|
36
|
+
# ...
|
37
|
+
]
|
38
|
+
|
39
|
+
READONLY_METADATA_FILE = File.join(File.dirname(__FILE__), 'models', 'readonly-test', 'model')
|
40
|
+
READONLY_MODEL_FILE = File.join(File.dirname(__FILE__), 'models', 'readonly-test', 'metadata')
|
41
|
+
METADATA_FILE = File.join(File.dirname(__FILE__), 'models', 'test', 'model')
|
42
|
+
MODEL_FILE = File.join(File.dirname(__FILE__), 'models', 'test', 'metadata')
|
43
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
svm_type c_svc
|
2
|
+
kernel_type rbf
|
3
|
+
gamma 0.0454545
|
4
|
+
nr_class 2
|
5
|
+
total_sv 7
|
6
|
+
rho -0.00785397
|
7
|
+
label 0 1
|
8
|
+
nr_sv 4 3
|
9
|
+
SV
|
10
|
+
1.178526450216892 0:1 1:1 2:1 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
11
|
+
0.8670862694825178 0:1 1:0 2:0 3:1 4:1 5:1 6:1 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
12
|
+
1.565208229773764 0:0 1:1 2:0 3:1 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:1 13:1 14:1 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
13
|
+
3.06895439364994 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:1 16:0 17:0 18:0 19:0 20:0 21:0
|
14
|
+
-2.445428417806817 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
15
|
+
-0.5364005807527658 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:1 17:1 18:1 19:0 20:0 21:0
|
16
|
+
-3.697946344563531 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:1 20:1 21:1
|
Binary file
|
@@ -0,0 +1,16 @@
|
|
1
|
+
svm_type c_svc
|
2
|
+
kernel_type rbf
|
3
|
+
gamma 0.0454545
|
4
|
+
nr_class 2
|
5
|
+
total_sv 7
|
6
|
+
rho -0.00785397
|
7
|
+
label 0 1
|
8
|
+
nr_sv 4 3
|
9
|
+
SV
|
10
|
+
1.178526450216892 0:1 1:1 2:1 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
11
|
+
0.8670862694825178 0:1 1:0 2:0 3:1 4:1 5:1 6:1 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
12
|
+
1.565208229773764 0:0 1:1 2:0 3:1 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:1 13:1 14:1 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
13
|
+
3.06895439364994 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:1 16:0 17:0 18:0 19:0 20:0 21:0
|
14
|
+
-2.445428417806817 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
|
15
|
+
-0.5364005807527658 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:1 17:1 18:1 19:0 20:0 21:0
|
16
|
+
-3.697946344563531 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:1 20:1 21:1
|
Binary file
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
class TestHoatzin < Test::Unit::TestCase
|
4
|
+
|
5
|
+
context "An untrained Hoatzin classifier" do
|
6
|
+
|
7
|
+
setup do
|
8
|
+
@c = Hoatzin::Classifier.new()
|
9
|
+
end
|
10
|
+
|
11
|
+
should "support training and classification" do
|
12
|
+
assert_equal @c.train(:positive, "Thats nice"), [[1, 1]]
|
13
|
+
assert_equal @c.classify("Thats nice"), :positive
|
14
|
+
end
|
15
|
+
|
16
|
+
context "that has been trained" do
|
17
|
+
|
18
|
+
setup do
|
19
|
+
TRAINING_LABELS.each_with_index do |label, index|
|
20
|
+
@c.train(label, TRAINING_DOCS[index])
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
should "classify the test set correctly" do
|
25
|
+
#@c.save(:metadata => METADATA_FILE, :model => MODEL_FILE, :update => true)
|
26
|
+
TESTING_LABELS.each_with_index do |label, index|
|
27
|
+
assert_equal @c.classify(TESTING_DOCS[index]), label
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
should "return the classifications" do
|
32
|
+
assert_equal @c.classifications, [1,0]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
context "An untrained Hoatzin classifier with an un-updatable model" do
|
39
|
+
|
40
|
+
setup do
|
41
|
+
@c = Hoatzin::Classifier.new(:metadata => READONLY_METADATA_FILE, :model => READONLY_MODEL_FILE )
|
42
|
+
end
|
43
|
+
|
44
|
+
should "classify the test set correctly" do
|
45
|
+
TESTING_LABELS.each_with_index do |label, index|
|
46
|
+
assert_equal @c.classify(TESTING_DOCS[index]), label
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
should "not allow further training" do
|
51
|
+
#@c.train(:positive, "Thats nice")
|
52
|
+
assert_raises(Hoatzin::Classifier::ReadOnly) { @c.train(:positive, "Thats nice") }
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
context "An untrained Hoatzin classifier with an updatable model" do
|
58
|
+
|
59
|
+
setup do
|
60
|
+
@c = Hoatzin::Classifier.new(:metadata => METADATA_FILE, :model => MODEL_FILE )
|
61
|
+
end
|
62
|
+
|
63
|
+
should "classify the test set correctly" do
|
64
|
+
TESTING_LABELS.each_with_index do |label, index|
|
65
|
+
assert_equal @c.classify(TESTING_DOCS[index]), label
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
should "allow further training" do
|
70
|
+
assert_nothing_raised {@c.train :positive, "Thats nice" }
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
metadata
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hoatzin
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- robl
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-12-31 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: libsvm-ruby-swig
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :runtime
|
31
|
+
prerelease: false
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: fast-stemmer
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
segments:
|
41
|
+
- 0
|
42
|
+
version: "0"
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: shoulda
|
48
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
version: "0"
|
56
|
+
type: :development
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: *id003
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: bundler
|
61
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ~>
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
segments:
|
67
|
+
- 1
|
68
|
+
- 0
|
69
|
+
- 0
|
70
|
+
version: 1.0.0
|
71
|
+
type: :development
|
72
|
+
prerelease: false
|
73
|
+
version_requirements: *id004
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: jeweler
|
76
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
77
|
+
none: false
|
78
|
+
requirements:
|
79
|
+
- - ~>
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
segments:
|
82
|
+
- 1
|
83
|
+
- 5
|
84
|
+
- 2
|
85
|
+
version: 1.5.2
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: *id005
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: rcov
|
91
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
segments:
|
97
|
+
- 0
|
98
|
+
version: "0"
|
99
|
+
type: :development
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *id006
|
102
|
+
description: Hoatzin is a text classifier in Ruby that uses SVM for it's classification.
|
103
|
+
email: robl@rjlee.net
|
104
|
+
executables: []
|
105
|
+
|
106
|
+
extensions: []
|
107
|
+
|
108
|
+
extra_rdoc_files:
|
109
|
+
- LICENSE.txt
|
110
|
+
- README.markdown
|
111
|
+
files:
|
112
|
+
- Gemfile
|
113
|
+
- Gemfile.lock
|
114
|
+
- LICENSE.txt
|
115
|
+
- README.markdown
|
116
|
+
- Rakefile
|
117
|
+
- VERSION
|
118
|
+
- hoatzin.gemspec
|
119
|
+
- lib/hoatzin.rb
|
120
|
+
- test/helper.rb
|
121
|
+
- test/models/readonly-test/metadata
|
122
|
+
- test/models/readonly-test/model
|
123
|
+
- test/models/test/metadata
|
124
|
+
- test/models/test/model
|
125
|
+
- test/test_hoatzin.rb
|
126
|
+
has_rdoc: true
|
127
|
+
homepage: http://github.com/rjlee/hoatzin
|
128
|
+
licenses:
|
129
|
+
- MIT
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
|
133
|
+
require_paths:
|
134
|
+
- lib
|
135
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
136
|
+
none: false
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
hash: -1045663747
|
141
|
+
segments:
|
142
|
+
- 0
|
143
|
+
version: "0"
|
144
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ">="
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
segments:
|
150
|
+
- 0
|
151
|
+
version: "0"
|
152
|
+
requirements: []
|
153
|
+
|
154
|
+
rubyforge_project:
|
155
|
+
rubygems_version: 1.3.7
|
156
|
+
signing_key:
|
157
|
+
specification_version: 3
|
158
|
+
summary: SVM Classifier in Ruby
|
159
|
+
test_files:
|
160
|
+
- test/helper.rb
|
161
|
+
- test/test_hoatzin.rb
|