hoatzin 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ gem 'libsvm-ruby-swig'
7
+ gem 'fast-stemmer'
8
+
9
+ # Add dependencies to develop your gem here.
10
+ # Include everything needed to run rake, tests, features, etc.
11
+ group :development do
12
+ gem "shoulda", ">= 0"
13
+ gem "bundler", "~> 1.0.0"
14
+ gem "jeweler", "~> 1.5.2"
15
+ gem "rcov", ">= 0"
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,24 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ fast-stemmer (1.0.0)
5
+ git (1.2.5)
6
+ jeweler (1.5.2)
7
+ bundler (~> 1.0.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ libsvm-ruby-swig (0.4.0)
11
+ rake (0.8.7)
12
+ rcov (0.9.9)
13
+ shoulda (2.11.3)
14
+
15
+ PLATFORMS
16
+ ruby
17
+
18
+ DEPENDENCIES
19
+ bundler (~> 1.0.0)
20
+ fast-stemmer
21
+ jeweler (~> 1.5.2)
22
+ libsvm-ruby-swig
23
+ rcov
24
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 robl
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,48 @@
1
+ # hoatzin
2
+
3
+ Hoatzin is a text classifier in Ruby that uses SVM for its classification.
4
+
5
+ ## Installation
6
+
7
+ gem install hoatzin
8
+
9
+ ## Usage
10
+
11
+ # Create a hoatzin classifier
12
+ c = Hoatzin::Classifier.new()
13
+
14
+ # Train the classifier with a classification and some text
15
+ c.train(:positive, "Thats nice")
16
+
17
+ # This will return the most likely classification (:positive)
18
+ c.classify("Thats nice")
19
+
20
+ ## Storage
21
+
22
+ The Hoatzin classifier supports saving your trained classifier to the filesystem. It needs
23
+ to store the libsvm model and the associated metadata as two separate files.
24
+
25
+ # Load a previously trained classifier
26
+ c = Hoatzin::Classifier.new(:metadata => '/path/to/file', :model => '/path/to/file')
27
+
28
+ # Save an existing trained classifier, without training data
29
+ # The #train method will raise an exception if called when the classifier is reloaded
30
+ c.save(:metadata => '/path/to/file', :model => '/path/to/file')
31
+
32
+ # Save an existing trained classifier, with training data
33
+ # The #train method can continue to be called when the classifier is reloaded
34
+ c.save(:metadata => '/path/to/file', :model => '/path/to/file', :update => true)
35
+
36
+ The classifier can continue to be trained if the model is saved with the :update => true option,
37
+ however the files stored on the filesystem will be much larger as they will contain copies
38
+ of all the documents used to train the classifier. It is generally advised to save without
39
+ the :update => true option unless it is definitely required.
40
+
41
+ ## Acknowledgements
42
+
43
+ See http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/ for the original inspiration.
44
+
45
+ ## Copyright and License
46
+
47
+ MIT License - See LICENSE.txt for details.
48
+ Copyright (c) 2010, Rob Lee
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'rake'
11
+
12
+ require 'jeweler'
13
+ Jeweler::Tasks.new do |gem|
14
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
15
+ gem.name = "hoatzin"
16
+ gem.homepage = "http://github.com/rjlee/hoatzin"
17
+ gem.license = "MIT"
18
+ gem.summary = %Q{SVM Classifier in Ruby}
19
+ gem.description = %Q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
20
+ gem.email = "robl@rjlee.net"
21
+ gem.authors = ["robl"]
22
+ # Include your dependencies below. Runtime dependencies are required when using your gem,
23
+ # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
24
+ # gem.add_runtime_dependency 'jabber4r', '> 0.1'
25
+ # gem.add_development_dependency 'rspec', '> 1.2.3'
26
+ end
27
+ Jeweler::RubygemsDotOrgTasks.new
28
+
29
+ require 'rake/testtask'
30
+ Rake::TestTask.new(:test) do |test|
31
+ test.libs << 'lib' << 'test'
32
+ test.pattern = 'test/**/test_*.rb'
33
+ test.verbose = true
34
+ end
35
+
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/test_*.rb'
40
+ test.verbose = true
41
+ end
42
+
43
+ task :default => :test
44
+
45
+ require 'rake/rdoctask'
46
+ Rake::RDocTask.new do |rdoc|
47
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
+
49
+ rdoc.rdoc_dir = 'rdoc'
50
+ rdoc.title = "hoatzin #{version}"
51
+ rdoc.rdoc_files.include('README*')
52
+ rdoc.rdoc_files.include('lib/**/*.rb')
53
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/hoatzin.gemspec ADDED
@@ -0,0 +1,73 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{hoatzin}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["robl"]
12
+ s.date = %q{2010-12-31}
13
+ s.description = %q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
14
+ s.email = %q{robl@rjlee.net}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.markdown"
18
+ ]
19
+ s.files = [
20
+ "Gemfile",
21
+ "Gemfile.lock",
22
+ "LICENSE.txt",
23
+ "README.markdown",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "hoatzin.gemspec",
27
+ "lib/hoatzin.rb",
28
+ "test/helper.rb",
29
+ "test/models/readonly-test/metadata",
30
+ "test/models/readonly-test/model",
31
+ "test/models/test/metadata",
32
+ "test/models/test/model",
33
+ "test/test_hoatzin.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/rjlee/hoatzin}
36
+ s.licenses = ["MIT"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.7}
39
+ s.summary = %q{SVM Classifier in Ruby}
40
+ s.test_files = [
41
+ "test/helper.rb",
42
+ "test/test_hoatzin.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
50
+ s.add_runtime_dependency(%q<libsvm-ruby-swig>, [">= 0"])
51
+ s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
52
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
53
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
54
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
55
+ s.add_development_dependency(%q<rcov>, [">= 0"])
56
+ else
57
+ s.add_dependency(%q<libsvm-ruby-swig>, [">= 0"])
58
+ s.add_dependency(%q<fast-stemmer>, [">= 0"])
59
+ s.add_dependency(%q<shoulda>, [">= 0"])
60
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
61
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
62
+ s.add_dependency(%q<rcov>, [">= 0"])
63
+ end
64
+ else
65
+ s.add_dependency(%q<libsvm-ruby-swig>, [">= 0"])
66
+ s.add_dependency(%q<fast-stemmer>, [">= 0"])
67
+ s.add_dependency(%q<shoulda>, [">= 0"])
68
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
70
+ s.add_dependency(%q<rcov>, [">= 0"])
71
+ end
72
+ end
73
+
data/lib/hoatzin.rb ADDED
@@ -0,0 +1,189 @@
1
+ require 'svm'
2
+ require 'fast_stemmer'
3
+ require 'iconv'
4
+
5
+ module Hoatzin
6
+ class Classifier
7
+
8
+ class ReadOnly < Exception; end
9
+
10
+ attr_reader :classifications
11
+
12
+ def initialize options = {}
13
+
14
+ @metadata_file = options.delete(:metadata) || nil
15
+ @model_file = options.delete(:model) || nil
16
+ @documents = []
17
+ @dictionary = []
18
+ @classifications = []
19
+ @labels = []
20
+ @feature_vectors = []
21
+ @problem = @model = nil
22
+ @cache = 0
23
+ @readonly = false
24
+
25
+ # If we have model and metadata files then load them
26
+ load if @metadata_file && @model_file
27
+
28
+ # Define kernel parameters for libsvm
29
+ @parameters = Parameter.new(:C => 100,
30
+ :degree => 1,
31
+ :coef0 => 0,
32
+ :eps => 0.001)
33
+
34
+ end
35
+
36
+ def train classification, text
37
+ # Only allow retraining if we have all the required data
38
+ raise ReadOnly if @readonly
39
+
40
+ # Add the classification if we haven't seen it before
41
+ @classifications << classification unless @classifications.include?(classification)
42
+
43
+ # Tokenize the text
44
+ tokens = Classifier.tokenize(text)
45
+
46
+ # Add tokens to word list
47
+ @dictionary << tokens
48
+ @dictionary.flatten!.uniq!
49
+
50
+ # Add to list of documents
51
+ @documents << tokens
52
+
53
+ # Add classification to classification list
54
+ @labels << @classifications.index(classification)
55
+
56
+ # Compute the feature vectors
57
+ @feature_vectors = @documents.map { |doc| @dictionary.map{|x| doc.include?(x) ? 1 : 0} }
58
+ end
59
+
60
+ def classify text
61
+ # Only update the model if we've trained more documents since it was last updated
62
+ if !@readonly && @documents.length > @cache
63
+ return nil if @documents.length == 0
64
+ @cache = @documents.length
65
+ assign_model
66
+ end
67
+
68
+ # Tokenize the text
69
+ tokens = Classifier.tokenize(text)
70
+
71
+ # Calculate the feature vectors for the text to be classified
72
+ f_vector = @dictionary.map{|x| tokens.include?(x) ? 1 : 0}
73
+
74
+ # Classify and return classification
75
+ pred, probs = @model.predict_probability(f_vector)
76
+ @classifications[pred.to_i]
77
+ end
78
+
79
+ def save options = {}
80
+ @metadata_file = options[:metadata] if options.key?(:metadata)
81
+ @model_file = options[:model] if options.key?(:model)
82
+ return false unless (@metadata_file && @model_file)
83
+ data = { :dictionary => @dictionary, :classifications => @classifications}
84
+ data.merge!(:documents => @documents,
85
+ :labels => @labels,
86
+ :feature_vectors => @feature_vectors,
87
+ :cache => @cache) if options[:update]
88
+ File.open(@metadata_file, 'w+') { |f| Marshal.dump(data, f) }
89
+ assign_model if @model.nil?
90
+ @model.save(@model_file)
91
+ end
92
+
93
+ protected
94
+ def load
95
+ data = {}
96
+ File.open(@metadata_file) { |f| data = Marshal.load(f) }
97
+ @dictionary = data[:dictionary]
98
+ @classifications = data[:classifications]
99
+ if data.key?(:documents)
100
+ @documents = data[:documents]
101
+ @labels = data[:labels]
102
+ @feature_vectors = data[:feature_vectors]
103
+ @cache = data[:cache]
104
+ end
105
+ @readonly = @documents.length > 0 ? false : true
106
+ @model = Model.new(@model_file)
107
+ end
108
+
109
+ def assign_model
110
+ @problem = Problem.new(@labels, @feature_vectors)
111
+ @model = Model.new(@problem, @parameters)
112
+ end
113
+
114
+ # Adapted from ankusa, to replace with tokenizer gem
115
+ def self.tokenize text
116
+ tokens = []
117
+ # from http://www.jroller.com/obie/tags/unicode
118
+ converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
119
+ converter.iconv(text).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
120
+ text.tr('-', ' ').gsub(/[^\w\s]/," ").split.each do |token|
121
+ tokens << token if (token.length > 3 && !Classifier.stop_words.include?(token))
122
+ end
123
+ tokens
124
+ end
125
+
126
+ # ftp://ftp.cs.cornell.edu/pub/smart/english.stop
127
+ def self.stop_words
128
+ %w{
129
+ a a's able about above according accordingly across actually after
130
+ afterwards again against ain't all allow allows almost alone along
131
+ already also although always am among amongst an and another any
132
+ anybody anyhow anyone anything anyway anyways anywhere apart appear
133
+ appreciate appropriate are aren't around as aside ask asking
134
+ associated at available away awfully b be became because become
135
+ becomes becoming been before beforehand behind being believe below
136
+ beside besides best better between beyond both brief but by c
137
+ c'mon c's came can can't cannot cant cause causes certain certainly
138
+ changes clearly co com come comes concerning consequently consider
139
+ considering contain containing contains corresponding could couldn't
140
+ course currently d definitely described despite did didn't different
141
+ do does doesn't doing don't done down downwards during e each edu
142
+ eg eight either else elsewhere enough entirely especially et etc
143
+ even ever every everybody everyone everything everywhere ex exactly
144
+ example except f far few fifth first five followed following follows
145
+ for former formerly forth four from further furthermore g get gets
146
+ getting given gives go goes going gone got gotten greetings h had
147
+ hadn't happens hardly has hasn't have haven't having he he's hello
148
+ help hence her here here's hereafter hereby herein hereupon hers
149
+ herself hi him himself his hither hopefully how howbeit however i
150
+ i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
151
+ indicate indicated indicates inner insofar instead into inward is
152
+ isn't it it'd it'll it's its itself j just k keep keeps kept know
153
+ knows known l last lately later latter latterly least less lest let
154
+ let's like liked likely little look looking looks ltd m mainly many
155
+ may maybe me mean meanwhile merely might more moreover most mostly
156
+ much must my myself n name namely nd near nearly necessary need needs
157
+ neither never nevertheless new next nine no nobody non none noone
158
+ nor normally not nothing novel now nowhere o obviously of off often
159
+ oh ok okay old on once one ones only onto or other others otherwise
160
+ ought our ours ourselves out outside over overall own p particular
161
+ particularly per perhaps placed please plus possible presumably
162
+ probably provides q que quite qv r rather rd re really reasonably
163
+ regarding regardless regards relatively respectively right s said
164
+ same saw say saying says second secondly see seeing seem seemed
165
+ seeming seems seen self selves sensible sent serious seriously
166
+ seven several shall she should shouldn't since six so some somebody
167
+ somehow someone something sometime sometimes somewhat somewhere soon
168
+ sorry specified specify specifying still sub such sup sure t t's
169
+ take taken tell tends th than thank thanks thanx that that's thats
170
+ the their theirs them themselves then thence there there's thereafter
171
+ thereby therefore therein theres thereupon these they they'd they'll
172
+ they're they've think third this thorough thoroughly those though
173
+ three through throughout thru thus to together too took toward
174
+ towards tried tries truly try trying twice two u un under
175
+ unfortunately unless unlikely until unto up upon us use used useful
176
+ uses using usually uucp v value various very via viz vs w want wants
177
+ was wasn't way we we'd we'll we're we've welcome well went were weren't
178
+ what what's whatever when whence whenever where where's whereafter
179
+ whereas whereby wherein whereupon wherever whether which while
180
+ whither who who's whoever whole whom whose why will willing wish
181
+ with within without won't wonder would would wouldn't x y yes yet
182
+ you you'd you'll you're you've your yours yourself yourselves
183
+ z zero
184
+ }
185
+ end
186
+
187
+
188
+ end
189
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'hoatzin'
16
+
17
+ class Test::Unit::TestCase
18
+
19
+ TRAINING_LABELS = [1, 1, 0, 1, 1, 0, 0]
20
+ TRAINING_DOCS = [
21
+ 'FREE NATIONAL TREASURE',
22
+ 'FREE TV for EVERY visitor',
23
+ 'Peter and Stewie are hilarious',
24
+ 'AS SEEN ON NATIONAL TV',
25
+ 'FREE drugs',
26
+ 'New episode rocks, Peter and Stewie are hilarious',
27
+ 'Peter is my fav!'
28
+ # ...
29
+ ]
30
+
31
+ TESTING_LABELS = [1, 0, 0]
32
+ TESTING_DOCS = [
33
+ 'FREE lotterry for the NATIONAL TREASURE !!!',
34
+ 'Stewie is hilarious',
35
+ 'Poor Peter ... hilarious',
36
+ # ...
37
+ ]
38
+
39
+ READONLY_METADATA_FILE = File.join(File.dirname(__FILE__), 'models', 'readonly-test', 'model')
40
+ READONLY_MODEL_FILE = File.join(File.dirname(__FILE__), 'models', 'readonly-test', 'metadata')
41
+ METADATA_FILE = File.join(File.dirname(__FILE__), 'models', 'test', 'model')
42
+ MODEL_FILE = File.join(File.dirname(__FILE__), 'models', 'test', 'metadata')
43
+ end
@@ -0,0 +1,16 @@
1
+ svm_type c_svc
2
+ kernel_type rbf
3
+ gamma 0.0454545
4
+ nr_class 2
5
+ total_sv 7
6
+ rho -0.00785397
7
+ label 0 1
8
+ nr_sv 4 3
9
+ SV
10
+ 1.178526450216892 0:1 1:1 2:1 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
11
+ 0.8670862694825178 0:1 1:0 2:0 3:1 4:1 5:1 6:1 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
12
+ 1.565208229773764 0:0 1:1 2:0 3:1 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:1 13:1 14:1 15:0 16:0 17:0 18:0 19:0 20:0 21:0
13
+ 3.06895439364994 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:1 16:0 17:0 18:0 19:0 20:0 21:0
14
+ -2.445428417806817 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
15
+ -0.5364005807527658 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:1 17:1 18:1 19:0 20:0 21:0
16
+ -3.697946344563531 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:1 20:1 21:1
Binary file
@@ -0,0 +1,16 @@
1
+ svm_type c_svc
2
+ kernel_type rbf
3
+ gamma 0.0454545
4
+ nr_class 2
5
+ total_sv 7
6
+ rho -0.00785397
7
+ label 0 1
8
+ nr_sv 4 3
9
+ SV
10
+ 1.178526450216892 0:1 1:1 2:1 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
11
+ 0.8670862694825178 0:1 1:0 2:0 3:1 4:1 5:1 6:1 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
12
+ 1.565208229773764 0:0 1:1 2:0 3:1 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:1 13:1 14:1 15:0 16:0 17:0 18:0 19:0 20:0 21:0
13
+ 3.06895439364994 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:1 16:0 17:0 18:0 19:0 20:0 21:0
14
+ -2.445428417806817 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:0 20:0 21:0
15
+ -0.5364005807527658 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:1 10:1 11:1 12:0 13:0 14:0 15:0 16:1 17:1 18:1 19:0 20:0 21:0
16
+ -3.697946344563531 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:0 9:0 10:0 11:0 12:0 13:0 14:0 15:0 16:0 17:0 18:0 19:1 20:1 21:1
Binary file
@@ -0,0 +1,74 @@
1
+ require 'helper'
2
+
3
+ class TestHoatzin < Test::Unit::TestCase
4
+
5
+ context "An untrained Hoatzin classifier" do
6
+
7
+ setup do
8
+ @c = Hoatzin::Classifier.new()
9
+ end
10
+
11
+ should "support training and classification" do
12
+ assert_equal @c.train(:positive, "Thats nice"), [[1, 1]]
13
+ assert_equal @c.classify("Thats nice"), :positive
14
+ end
15
+
16
+ context "that has been trained" do
17
+
18
+ setup do
19
+ TRAINING_LABELS.each_with_index do |label, index|
20
+ @c.train(label, TRAINING_DOCS[index])
21
+ end
22
+ end
23
+
24
+ should "classify the test set correctly" do
25
+ #@c.save(:metadata => METADATA_FILE, :model => MODEL_FILE, :update => true)
26
+ TESTING_LABELS.each_with_index do |label, index|
27
+ assert_equal @c.classify(TESTING_DOCS[index]), label
28
+ end
29
+ end
30
+
31
+ should "return the classifications" do
32
+ assert_equal @c.classifications, [1,0]
33
+ end
34
+ end
35
+
36
+ end
37
+
38
+ context "An untrained Hoatzin classifier with an un-updatable model" do
39
+
40
+ setup do
41
+ @c = Hoatzin::Classifier.new(:metadata => READONLY_METADATA_FILE, :model => READONLY_MODEL_FILE )
42
+ end
43
+
44
+ should "classify the test set correctly" do
45
+ TESTING_LABELS.each_with_index do |label, index|
46
+ assert_equal @c.classify(TESTING_DOCS[index]), label
47
+ end
48
+ end
49
+
50
+ should "not allow further training" do
51
+ #@c.train(:positive, "Thats nice")
52
+ assert_raises(Hoatzin::Classifier::ReadOnly) { @c.train(:positive, "Thats nice") }
53
+ end
54
+
55
+ end
56
+
57
+ context "An untrained Hoatzin classifier with an updatable model" do
58
+
59
+ setup do
60
+ @c = Hoatzin::Classifier.new(:metadata => METADATA_FILE, :model => MODEL_FILE )
61
+ end
62
+
63
+ should "classify the test set correctly" do
64
+ TESTING_LABELS.each_with_index do |label, index|
65
+ assert_equal @c.classify(TESTING_DOCS[index]), label
66
+ end
67
+ end
68
+
69
+ should "allow further training" do
70
+ assert_nothing_raised {@c.train :positive, "Thats nice" }
71
+ end
72
+
73
+ end
74
+ end
metadata ADDED
@@ -0,0 +1,161 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hoatzin
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - robl
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-12-31 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: libsvm-ruby-swig
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :runtime
31
+ prerelease: false
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: fast-stemmer
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ segments:
41
+ - 0
42
+ version: "0"
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: shoulda
48
+ requirement: &id003 !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *id003
59
+ - !ruby/object:Gem::Dependency
60
+ name: bundler
61
+ requirement: &id004 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ~>
65
+ - !ruby/object:Gem::Version
66
+ segments:
67
+ - 1
68
+ - 0
69
+ - 0
70
+ version: 1.0.0
71
+ type: :development
72
+ prerelease: false
73
+ version_requirements: *id004
74
+ - !ruby/object:Gem::Dependency
75
+ name: jeweler
76
+ requirement: &id005 !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ~>
80
+ - !ruby/object:Gem::Version
81
+ segments:
82
+ - 1
83
+ - 5
84
+ - 2
85
+ version: 1.5.2
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: *id005
89
+ - !ruby/object:Gem::Dependency
90
+ name: rcov
91
+ requirement: &id006 !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ segments:
97
+ - 0
98
+ version: "0"
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *id006
102
+ description: Hoatzin is a text classifier in Ruby that uses SVM for it's classification.
103
+ email: robl@rjlee.net
104
+ executables: []
105
+
106
+ extensions: []
107
+
108
+ extra_rdoc_files:
109
+ - LICENSE.txt
110
+ - README.markdown
111
+ files:
112
+ - Gemfile
113
+ - Gemfile.lock
114
+ - LICENSE.txt
115
+ - README.markdown
116
+ - Rakefile
117
+ - VERSION
118
+ - hoatzin.gemspec
119
+ - lib/hoatzin.rb
120
+ - test/helper.rb
121
+ - test/models/readonly-test/metadata
122
+ - test/models/readonly-test/model
123
+ - test/models/test/metadata
124
+ - test/models/test/model
125
+ - test/test_hoatzin.rb
126
+ has_rdoc: true
127
+ homepage: http://github.com/rjlee/hoatzin
128
+ licenses:
129
+ - MIT
130
+ post_install_message:
131
+ rdoc_options: []
132
+
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ hash: -1045663747
141
+ segments:
142
+ - 0
143
+ version: "0"
144
+ required_rubygems_version: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ segments:
150
+ - 0
151
+ version: "0"
152
+ requirements: []
153
+
154
+ rubyforge_project:
155
+ rubygems_version: 1.3.7
156
+ signing_key:
157
+ specification_version: 3
158
+ summary: SVM Classifier in Ruby
159
+ test_files:
160
+ - test/helper.rb
161
+ - test/test_hoatzin.rb