hoatzin 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
# Bundler manifest for the hoatzin gem.
# Gems are fetched over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Runtime dependencies (no version constraints are declared for these).
gem 'libsvm-ruby-swig'
gem 'fast-stemmer'

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.2"
  gem "rcov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,24 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ fast-stemmer (1.0.0)
5
+ git (1.2.5)
6
+ jeweler (1.5.2)
7
+ bundler (~> 1.0.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ libsvm-ruby-swig (0.4.0)
11
+ rake (0.8.7)
12
+ rcov (0.9.9)
13
+ shoulda (2.11.3)
14
+
15
+ PLATFORMS
16
+ ruby
17
+
18
+ DEPENDENCIES
19
+ bundler (~> 1.0.0)
20
+ fast-stemmer
21
+ jeweler (~> 1.5.2)
22
+ libsvm-ruby-swig
23
+ rcov
24
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 robl
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,48 @@
1
+ # hoatzin
2
+
3
+ Hoatzin is a text classifier in Ruby that uses SVM for its classification.
4
+
5
+ ## Installation
6
+
7
+ gem install hoatzin
8
+
9
+ ## Usage
10
+
11
+ # Create a hoatzin classifier
12
+ c = Hoatzin::Classifier.new()
13
+
14
+ # Train the classifier with a classification and some text
15
+ c.train(:positive, "Thats nice")
16
+
17
+ # This will return the most likely classification (:positive)
18
+ c.classify("Thats nice")
19
+
20
+ ## Storage
21
+
22
+ The Hoatzin classifier supports saving your trained classifier to the filesystem. It needs
23
+ to store the libsvm model and the associated metadata as two separate files.
24
+
25
+ # Load a previously trained classifier
26
+ c = Hoatzin::Classifier.new(:metadata => '/path/to/file', :model => '/path/to/file')
27
+
28
+ # Save an existing trained classifier, without training data
29
+ # The #train method will raise an exception if called when the classifier is reloaded
30
+ c.save(:metadata => '/path/to/file', :model => '/path/to/file')
31
+
32
+ # Save an existing trained classifier, with training data
33
+ # The #train method can continue to be called when the classifier is reloaded
34
+ c.save(:metadata => '/path/to/file', :model => '/path/to/file', :update => true)
35
+
36
+ The classifier can continue to be trained if the model is saved with the :update => true option,
37
+ however the files stored on the filesystem will be much larger as they will contain copies
38
+ of all the documents used during training the classifier. It is generally advised to save without
39
+ the :update => true option unless it is definitely required.
40
+
41
+ ## Acknowledgements
42
+
43
+ See http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/ for the original inspiration.
44
+
45
+ ## Copyright and License
46
+
47
+ MIT License - see LICENSE.txt for details.
48
+ Copyright (c) 2010, Rob Lee
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
# Rake tasks for the hoatzin gem: packaging (jeweler), tests, coverage (rcov)
# and RDoc generation.
require 'rubygems'
require 'bundler'

# Make the gems from the :default and :development Gemfile groups loadable;
# bail out with Bundler's own status code if any of them is missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "hoatzin"
  gem.homepage = "http://github.com/rjlee/hoatzin"
  gem.license = "MIT"
  gem.summary = %Q{SVM Classifier in Ruby}
  gem.description = %Q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
  gem.email = "robl@rjlee.net"
  gem.authors = ["robl"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` runs the same test files under rcov.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the version so a trailing newline in the VERSION file does not end
  # up inside the generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "hoatzin #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/hoatzin.gemspec ADDED
@@ -0,0 +1,73 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{hoatzin}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["robl"]
12
+ s.date = %q{2010-12-31}
13
+ s.description = %q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
14
+ s.email = %q{robl@rjlee.net}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.markdown"
18
+ ]
19
+ s.files = [
20
+ "Gemfile",
21
+ "Gemfile.lock",
22
+ "LICENSE.txt",
23
+ "README.markdown",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "hoatzin.gemspec",
27
+ "lib/hoatzin.rb",
28
+ "test/helper.rb",
29
+ "test/models/readonly-test/metadata",
30
+ "test/models/readonly-test/model",
31
+ "test/models/test/metadata",
32
+ "test/models/test/model",
33
+ "test/test_hoatzin.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/rjlee/hoatzin}
36
+ s.licenses = ["MIT"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.7}
39
+ s.summary = %q{SVM Classifier in Ruby}
40
+ s.test_files = [
41
+ "test/helper.rb",
42
+ "test/test_hoatzin.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
50
+ s.add_runtime_dependency(%q<libsvm-ruby-swig>, [">= 0"])
51
+ s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
52
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
53
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
54
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
55
+ s.add_development_dependency(%q<rcov>, [">= 0"])
56
+ else
57
+ s.add_dependency(%q<libsvm-ruby-swig>, [">= 0"])
58
+ s.add_dependency(%q<fast-stemmer>, [">= 0"])
59
+ s.add_dependency(%q<shoulda>, [">= 0"])
60
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
61
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
62
+ s.add_dependency(%q<rcov>, [">= 0"])
63
+ end
64
+ else
65
+ s.add_dependency(%q<libsvm-ruby-swig>, [">= 0"])
66
+ s.add_dependency(%q<fast-stemmer>, [">= 0"])
67
+ s.add_dependency(%q<shoulda>, [">= 0"])
68
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
70
+ s.add_dependency(%q<rcov>, [">= 0"])
71
+ end
72
+ end
73
+
data/lib/hoatzin.rb ADDED
@@ -0,0 +1,189 @@
1
+ require 'svm'
2
+ require 'fast_stemmer'
3
+ require 'iconv'
4
+
5
module Hoatzin
  # Text classifier that turns documents into binary bag-of-words feature
  # vectors and delegates learning/prediction to libsvm (the top-level
  # Parameter, Problem and Model classes loaded by `require 'svm'`).
  #
  #   c = Hoatzin::Classifier.new
  #   c.train(:positive, "Thats nice")
  #   c.classify("Thats nice") # => :positive
  class Classifier

    # Raised by #train when the classifier was loaded from files that do not
    # contain the original training documents.
    class ReadOnly < StandardError; end

    # Stop words from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
    # Built once and frozen; the list used to be re-created for every token.
    STOP_WORDS = %w{
      a a's able about above according accordingly across actually after
      afterwards again against ain't all allow allows almost alone along
      already also although always am among amongst an and another any
      anybody anyhow anyone anything anyway anyways anywhere apart appear
      appreciate appropriate are aren't around as aside ask asking
      associated at available away awfully b be became because become
      becomes becoming been before beforehand behind being believe below
      beside besides best better between beyond both brief but by c
      c'mon c's came can can't cannot cant cause causes certain certainly
      changes clearly co com come comes concerning consequently consider
      considering contain containing contains corresponding could couldn't
      course currently d definitely described despite did didn't different
      do does doesn't doing don't done down downwards during e each edu
      eg eight either else elsewhere enough entirely especially et etc
      even ever every everybody everyone everything everywhere ex exactly
      example except f far few fifth first five followed following follows
      for former formerly forth four from further furthermore g get gets
      getting given gives go goes going gone got gotten greetings h had
      hadn't happens hardly has hasn't have haven't having he he's hello
      help hence her here here's hereafter hereby herein hereupon hers
      herself hi him himself his hither hopefully how howbeit however i
      i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
      indicate indicated indicates inner insofar instead into inward is
      isn't it it'd it'll it's its itself j just k keep keeps kept know
      knows known l last lately later latter latterly least less lest let
      let's like liked likely little look looking looks ltd m mainly many
      may maybe me mean meanwhile merely might more moreover most mostly
      much must my myself n name namely nd near nearly necessary need needs
      neither never nevertheless new next nine no nobody non none noone
      nor normally not nothing novel now nowhere o obviously of off often
      oh ok okay old on once one ones only onto or other others otherwise
      ought our ours ourselves out outside over overall own p particular
      particularly per perhaps placed please plus possible presumably
      probably provides q que quite qv r rather rd re really reasonably
      regarding regardless regards relatively respectively right s said
      same saw say saying says second secondly see seeing seem seemed
      seeming seems seen self selves sensible sent serious seriously
      seven several shall she should shouldn't since six so some somebody
      somehow someone something sometime sometimes somewhat somewhere soon
      sorry specified specify specifying still sub such sup sure t t's
      take taken tell tends th than thank thanks thanx that that's thats
      the their theirs them themselves then thence there there's thereafter
      thereby therefore therein theres thereupon these they they'd they'll
      they're they've think third this thorough thoroughly those though
      three through throughout thru thus to together too took toward
      towards tried tries truly try trying twice two u un under
      unfortunately unless unlikely until unto up upon us use used useful
      uses using usually uucp v value various very via viz vs w want wants
      was wasn't way we we'd we'll we're we've welcome well went were weren't
      what what's whatever when whence whenever where where's whereafter
      whereas whereby wherein whereupon wherever whether which while
      whither who who's whoever whole whom whose why will willing wish
      with within without won't wonder would would wouldn't x y yes yet
      you you'd you'll you're you've your yours yourself yourselves
      z zero
    }.freeze

    # Classification labels in the order they were first seen by #train.
    attr_reader :classifications

    # Creates a classifier.
    #
    # options - Hash (left unmodified):
    #           :metadata - path of a metadata file written by #save
    #           :model    - path of a libsvm model file written by #save
    #           When both are given the stored classifier is loaded.
    def initialize(options = {})
      # Read with [] rather than delete so the caller's hash is not mutated.
      @metadata_file = options[:metadata]
      @model_file = options[:model]
      @documents = []
      @dictionary = []
      @classifications = []
      @labels = []
      @feature_vectors = []
      @problem = @model = nil
      @cache = 0 # number of documents the current model was built from
      @readonly = false

      # Define kernel parameters for libsvm
      @parameters = Parameter.new(:C => 100,
                                  :degree => 1,
                                  :coef0 => 0,
                                  :eps => 0.001)

      # If we have model and metadata files then load them
      load if @metadata_file && @model_file
    end

    # Adds one training document.
    #
    # classification - label for the document (any object; compared with ==)
    # text           - String containing the document
    #
    # Returns the feature vectors of all training documents.
    # Raises ReadOnly if the classifier was loaded without training data.
    def train(classification, text)
      # Only allow retraining if we have all the required data
      raise ReadOnly, "classifier was saved without its training data" if @readonly

      # Add the classification if we haven't seen it before
      @classifications << classification unless @classifications.include?(classification)

      tokens = Classifier.tokenize(text)

      # Array union keeps existing dictionary order and appends unseen tokens.
      @dictionary |= tokens
      @documents << tokens
      @labels << @classifications.index(classification)

      # The dictionary may have grown, so every vector is recomputed to keep
      # all of them the same length.
      @feature_vectors = @documents.map { |doc| feature_vector(doc) }
    end

    # Predicts the classification of text.
    #
    # Returns one of #classifications, or nil when no model is available
    # (nothing has been trained or loaded yet).
    def classify(text)
      # Only rebuild the model if documents were added since the last build
      if !@readonly && @documents.length > @cache
        assign_model
        @cache = @documents.length
      end
      return nil if @model.nil?

      prediction, _ = @model.predict_probability(feature_vector(Classifier.tokenize(text)))
      @classifications[prediction.to_i]
    end

    # Writes the classifier to disk as a metadata file plus a libsvm model file.
    #
    # options - Hash:
    #           :metadata - path for the metadata file (remembered for later calls)
    #           :model    - path for the model file (remembered for later calls)
    #           :update   - when true the training documents are stored too, so
    #                       the reloaded classifier can continue to be trained
    #
    # Returns false when a path is missing or there is nothing to save,
    # otherwise the result of Model#save.
    def save(options = {})
      @metadata_file = options[:metadata] if options.key?(:metadata)
      @model_file = options[:model] if options.key?(:model)
      return false unless @metadata_file && @model_file

      # Make sure the model on disk matches the dictionary written next to it:
      # rebuild when documents were trained after the model was last built.
      if !@readonly && (@model.nil? || @documents.length > @cache)
        return false if @documents.empty?
        assign_model
        @cache = @documents.length
      end
      return false if @model.nil?

      data = { :dictionary => @dictionary, :classifications => @classifications }
      if options[:update]
        data.merge!(:documents => @documents,
                    :labels => @labels,
                    :feature_vectors => @feature_vectors,
                    :cache => @cache)
      end
      # Marshal output is binary, so open the file in binary mode.
      File.open(@metadata_file, 'wb') { |f| Marshal.dump(data, f) }
      @model.save(@model_file)
    end

    # Splits text into the tokens used as features: dashes and non-word
    # characters become separators, then tokens of three characters or fewer
    # and stop words are dropped. Matching against the stop list is case
    # sensitive.
    #
    # Adapted from ankusa, to replace with tokenizer gem
    #
    # Returns an Array of Strings.
    def self.tokenize(text)
      to_ascii(text).tr('-', ' ').gsub(/[^\w\s]/, " ").split.select do |token|
        token.length > 3 && !STOP_WORDS.include?(token)
      end
    end

    # Returns the (frozen) list of stop words ignored by .tokenize.
    def self.stop_words
      STOP_WORDS
    end

    # Transliterates UTF-8 text to ASCII when Iconv is loaded; the result used
    # to be computed and then thrown away. Falls back to the unmodified text
    # when Iconv is unavailable or the conversion fails.
    # from http://www.jroller.com/obie/tags/unicode
    def self.to_ascii(text)
      return text unless defined?(Iconv)
      converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
      converter.iconv(text).unpack('U*').select { |cp| cp < 127 }.pack('U*')
    rescue StandardError
      text
    end
    private_class_method :to_ascii

    protected

    # Restores dictionary, classifications and (if present) training data from
    # @metadata_file, and the libsvm model from @model_file.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only load
    # metadata files from a trusted source.
    def load
      data = File.open(@metadata_file, 'rb') { |f| Marshal.load(f) }
      @dictionary = data[:dictionary]
      @classifications = data[:classifications]
      if data.key?(:documents)
        @documents = data[:documents]
        @labels = data[:labels]
        @feature_vectors = data[:feature_vectors]
        @cache = data[:cache] || 0
      end
      # Without the training documents the model cannot be rebuilt.
      @readonly = @documents.empty?
      @model = Model.new(@model_file)
    end

    # Builds a fresh libsvm model from the current labels and feature vectors.
    def assign_model
      @problem = Problem.new(@labels, @feature_vectors)
      @model = Model.new(@problem, @parameters)
    end

    # Returns an Array with one 0/1 entry per dictionary term, 1 when the
    # term occurs in tokens.
    def feature_vector(tokens)
      @dictionary.map { |term| tokens.include?(term) ? 1 : 0 }
    end

  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,43 @@
1
# Shared test setup: activates the bundled gems, puts lib/ and test/ on the
# load path and defines the fixtures used by the test cases.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

require 'test/unit'
require 'shoulda'

# The test directory ends up first on the load path, followed by lib/.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_dir, File.join(test_dir, '..', 'lib'))
require 'hoatzin'

class Test::Unit::TestCase

  # Training corpus: TRAINING_LABELS[i] is the label of TRAINING_DOCS[i].
  TRAINING_LABELS = [1, 1, 0, 1, 1, 0, 0]
  TRAINING_DOCS = [
    'FREE NATIONAL TREASURE',
    'FREE TV for EVERY visitor',
    'Peter and Stewie are hilarious',
    'AS SEEN ON NATIONAL TV',
    'FREE drugs',
    'New episode rocks, Peter and Stewie are hilarious',
    'Peter is my fav!'
  ]

  # Evaluation corpus: TESTING_LABELS[i] is the expected label of TESTING_DOCS[i].
  TESTING_LABELS = [1, 0, 0]
  TESTING_DOCS = [
    'FREE lotterry for the NATIONAL TREASURE !!!',
    'Stewie is hilarious',
    'Poor Peter ... hilarious'
  ]

  # Stored classifier fixtures.
  # NOTE(review): the *_METADATA_FILE constants point at files named 'model'
  # and the *_MODEL_FILE constants at files named 'metadata'. This looks
  # swapped - check the fixture contents before renaming anything.
  models_dir = File.join(File.dirname(__FILE__), 'models')
  READONLY_METADATA_FILE = File.join(models_dir, 'readonly-test', 'model')
  READONLY_MODEL_FILE = File.join(models_dir, 'readonly-test', 'metadata')
  METADATA_FILE = File.join(models_dir, 'test', 'model')
  MODEL_FILE = File.join(models_dir, 'test', 'metadata')
end
@@ -0,0 +1,16 @@
1
+ svm_type c_svc
2
+ kernel_type rbf
3
+ gamma 0.0454545
4
+ nr_class 2
5
+ total_sv 7
6
+ rho -0.00785397
7
+ label 0 1
8
+ nr_sv 4 3
9
+ SV
10
+ 1.178526450216892 0:1 1:1 2:1 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
11
+ 0.8670862694825178 0:1 1:0 2:0 3:1 4:1 5:1 6:1 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
12
+ 1.565208229773764 0:0 1:1 2:0 3:1 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:1 13:1 14:1 15:0 16:0 17:0 18:0 19:0 20:0 21:0
13
+ 3.06895439364994 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:1 16:0 17:0 18:0 19:0 20:0 21:0
14
+ -2.445428417806817 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
15
+ -0.5364005807527658 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:1 17:1 18:1 19:0 20:0 21:0
16
+ -3.697946344563531 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:1 20:1 21:1
Binary file
@@ -0,0 +1,16 @@
1
+ svm_type c_svc
2
+ kernel_type rbf
3
+ gamma 0.0454545
4
+ nr_class 2
5
+ total_sv 7
6
+ rho -0.00785397
7
+ label 0 1
8
+ nr_sv 4 3
9
+ SV
10
+ 1.178526450216892 0:1 1:1 2:1 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
11
+ 0.8670862694825178 0:1 1:0 2:0 3:1 4:1 5:1 6:1 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
12
+ 1.565208229773764 0:0 1:1 2:0 3:1 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:1 13:1 14:1 15:0 16:0 17:0 18:0 19:0 20:0 21:0
13
+ 3.06895439364994 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:1 16:0 17:0 18:0 19:0 20:0 21:0
14
+ -2.445428417806817 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
15
+ -0.5364005807527658 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:1 17:1 18:1 19:0 20:0 21:0
16
+ -3.697946344563531 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:1 20:1 21:1
Binary file
@@ -0,0 +1,74 @@
1
+ require 'helper'
2
+
3
# Exercises Hoatzin::Classifier freshly created, loaded from a read-only
# fixture and loaded from an updatable fixture.
class TestHoatzin < Test::Unit::TestCase

  context "An untrained Hoatzin classifier" do

    setup do
      @c = Hoatzin::Classifier.new
    end

    should "support training and classification" do
      # assert_equal takes (expected, actual); the reversed order produced
      # misleading failure messages.
      assert_equal [[1, 1]], @c.train(:positive, "Thats nice")
      assert_equal :positive, @c.classify("Thats nice")
    end

    context "that has been trained" do

      setup do
        TRAINING_LABELS.zip(TRAINING_DOCS).each do |label, doc|
          @c.train(label, doc)
        end
      end

      should "classify the test set correctly" do
        # NOTE(review): presumably how the updatable fixture was generated - confirm before relying on it:
        #@c.save(:metadata => METADATA_FILE, :model => MODEL_FILE, :update => true)
        assert_test_set_classified(@c)
      end

      should "return the classifications" do
        assert_equal [1, 0], @c.classifications
      end
    end

  end

  context "An untrained Hoatzin classifier with an un-updatable model" do

    setup do
      @c = Hoatzin::Classifier.new(:metadata => READONLY_METADATA_FILE, :model => READONLY_MODEL_FILE)
    end

    should "classify the test set correctly" do
      assert_test_set_classified(@c)
    end

    should "not allow further training" do
      assert_raises(Hoatzin::Classifier::ReadOnly) { @c.train(:positive, "Thats nice") }
    end

  end

  context "An untrained Hoatzin classifier with an updatable model" do

    setup do
      @c = Hoatzin::Classifier.new(:metadata => METADATA_FILE, :model => MODEL_FILE)
    end

    should "classify the test set correctly" do
      assert_test_set_classified(@c)
    end

    should "allow further training" do
      assert_nothing_raised { @c.train :positive, "Thats nice" }
    end

  end

  private

  # Asserts that classifier labels every TESTING_DOCS entry with the
  # corresponding TESTING_LABELS entry.
  def assert_test_set_classified(classifier)
    TESTING_LABELS.zip(TESTING_DOCS).each do |label, doc|
      assert_equal label, classifier.classify(doc)
    end
  end
end
metadata ADDED
@@ -0,0 +1,161 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hoatzin
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - robl
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-12-31 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: libsvm-ruby-swig
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ prerelease: false
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: fast-stemmer
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ segments:
41
+ - 0
42
+ version: "0"
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: shoulda
48
+ requirement: &id003 !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: bundler
61
+ requirement: &id004 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ~>
65
+ - !ruby/object:Gem::Version
66
+ segments:
67
+ - 1
68
+ - 0
69
+ - 0
70
+ version: 1.0.0
71
+ type: :development
72
+ prerelease: false
73
+ version_requirements: *id004
74
+ - !ruby/object:Gem::Dependency
75
+ name: jeweler
76
+ requirement: &id005 !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ~>
80
+ - !ruby/object:Gem::Version
81
+ segments:
82
+ - 1
83
+ - 5
84
+ - 2
85
+ version: 1.5.2
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: *id005
89
+ - !ruby/object:Gem::Dependency
90
+ name: rcov
91
+ requirement: &id006 !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ segments:
97
+ - 0
98
+ version: "0"
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *id006
102
+ description: Hoatzin is a text classifier in Ruby that uses SVM for it's classification.
103
+ email: robl@rjlee.net
104
+ executables: []
105
+
106
+ extensions: []
107
+
108
+ extra_rdoc_files:
109
+ - LICENSE.txt
110
+ - README.markdown
111
+ files:
112
+ - Gemfile
113
+ - Gemfile.lock
114
+ - LICENSE.txt
115
+ - README.markdown
116
+ - Rakefile
117
+ - VERSION
118
+ - hoatzin.gemspec
119
+ - lib/hoatzin.rb
120
+ - test/helper.rb
121
+ - test/models/readonly-test/metadata
122
+ - test/models/readonly-test/model
123
+ - test/models/test/metadata
124
+ - test/models/test/model
125
+ - test/test_hoatzin.rb
126
+ has_rdoc: true
127
+ homepage: http://github.com/rjlee/hoatzin
128
+ licenses:
129
+ - MIT
130
+ post_install_message:
131
+ rdoc_options: []
132
+
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ hash: -1045663747
141
+ segments:
142
+ - 0
143
+ version: "0"
144
+ required_rubygems_version: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ segments:
150
+ - 0
151
+ version: "0"
152
+ requirements: []
153
+
154
+ rubyforge_project:
155
+ rubygems_version: 1.3.7
156
+ signing_key:
157
+ specification_version: 3
158
+ summary: SVM Classifier in Ruby
159
+ test_files:
160
+ - test/helper.rb
161
+ - test/test_hoatzin.rb