hoatzin 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +6 -0
- data/README.markdown +38 -4
- data/VERSION +1 -1
- data/hoatzin.gemspec +6 -2
- data/lib/classifier.rb +115 -0
- data/lib/hoatzin.rb +5 -185
- data/lib/parser.rb +82 -0
- data/lib/vector_space/builder.rb +81 -0
- data/lib/vector_space/model.rb +46 -0
- data/test/models/readonly-test/metadata +9 -9
- data/test/models/readonly-test/model +0 -0
- data/test/models/test/metadata +9 -9
- data/test/models/test/model +0 -0
- data/test/test_hoatzin.rb +3 -2
- metadata +34 -22
data/LICENSE.txt
CHANGED
@@ -18,3 +18,9 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
18
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
19
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
20
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
22
|
+
Portions of this software are licensed under the MIT License, as specified in
|
23
|
+
their original form (from https://github.com/josephwilk/rsemantic) :
|
24
|
+
|
25
|
+
vector_space/builder.rb
|
26
|
+
vector_space/model.rb
|
data/README.markdown
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# hoatzin
|
2
2
|
|
3
|
-
Hoatzin is a text classifier in Ruby that uses
|
3
|
+
Hoatzin is a text classifier in Ruby that uses libsvm for its classification.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -19,8 +19,8 @@ gem install hoatzin
|
|
19
19
|
|
20
20
|
## Storage
|
21
21
|
|
22
|
-
The Hoatzin classifier supports saving your trained classifier to the filesystem. It
|
23
|
-
|
22
|
+
The Hoatzin classifier supports saving your trained classifier to the filesystem. It stores
|
23
|
+
the generated libsvm model and the required metadata as two separate files.
|
24
24
|
|
25
25
|
# Load a previously trained classifier
|
26
26
|
c = Hoatzin::Classifier.new(:metadata => '/path/to/file', :model => '/path/to/file')
|
@@ -36,11 +36,45 @@ to store the libsvm model and the associated metadata as two separate files.
|
|
36
36
|
The classifier can continue to be trained if the model is saved with the :update => true option,
|
37
37
|
however the files stored on the filesystem will be much larger as they will contain copies
|
38
38
|
of all the documents used while training the classifier. It is generally advised to save without
|
39
|
-
the :update => true option unless it is
|
39
|
+
the :update => true option unless it is required.
|
40
|
+
|
41
|
+
## Training
|
42
|
+
|
43
|
+
The #train method doesn't calculate all the required information for classification
|
44
|
+
(in particular the feature vectors) due to the time they take to recompute for each new
|
45
|
+
token generated when adding a document for training. This means that there can be a delay
|
46
|
+
when calling the #classify method for the first time whilst all the required information
|
47
|
+
is prepared. This preparation step can be explicitly called using the #sync method. This
|
48
|
+
method is transparently called by the #classify method when required. Sample usage of the #sync
|
49
|
+
method is shown below:
|
50
|
+
|
51
|
+
# Create a hoatzin classifier
|
52
|
+
c = Hoatzin::Classifier.new()
|
53
|
+
|
54
|
+
# Add the training data to the classifier
|
55
|
+
corpus.each do |doc|
|
56
|
+
c.train(doc[:classification], doc[:text])
|
57
|
+
end
|
58
|
+
|
59
|
+
# Force the calculation of the feature vectors and
|
60
|
+
# preparation of the SVM model. This can take some
|
61
|
+
# time if the corpus is large
|
62
|
+
c.sync
|
63
|
+
|
64
|
+
# Save the model and associated meta-data so we don't have to
|
65
|
+
# call sync again and wait for the feature vectors to be computed
|
66
|
+
c.save(:metadata => '/path/to/metadata', :model => '/path/to/model')
|
67
|
+
|
68
|
+
# Now call classify
|
69
|
+
c.classify("Spectacular show")
|
70
|
+
|
71
|
+
The saved model and metadata can be loaded again for classification, avoiding the
|
72
|
+
need to recompute the feature vectors.
|
40
73
|
|
41
74
|
## Acknowledgements
|
42
75
|
|
43
76
|
See http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/ for the original inspiration.
|
77
|
+
The Vector Space Model implementation is adapted from https://github.com/josephwilk/rsemantic
|
44
78
|
|
45
79
|
## Copyright and License
|
46
80
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/hoatzin.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{hoatzin}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["robl"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2011-01-03}
|
13
13
|
s.description = %q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
|
14
14
|
s.email = %q{robl@rjlee.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,7 +24,11 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"hoatzin.gemspec",
|
27
|
+
"lib/classifier.rb",
|
27
28
|
"lib/hoatzin.rb",
|
29
|
+
"lib/parser.rb",
|
30
|
+
"lib/vector_space/builder.rb",
|
31
|
+
"lib/vector_space/model.rb",
|
28
32
|
"test/helper.rb",
|
29
33
|
"test/models/readonly-test/metadata",
|
30
34
|
"test/models/readonly-test/model",
|
data/lib/classifier.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
module Hoatzin
|
2
|
+
class Classifier
|
3
|
+
|
4
|
+
class ReadOnly < Exception; end
|
5
|
+
class InvalidFormat < Exception; end
|
6
|
+
|
7
|
+
FORMAT_VERSION = 2
|
8
|
+
|
9
|
+
attr_reader :classifications
|
10
|
+
|
11
|
+
def initialize options = {}
|
12
|
+
|
13
|
+
@documents = []
|
14
|
+
@classifications = []
|
15
|
+
@labels = []
|
16
|
+
|
17
|
+
@problem = @model = nil
|
18
|
+
@cache = 0
|
19
|
+
@readonly = false
|
20
|
+
|
21
|
+
@metadata_file = options.delete(:metadata) || nil
|
22
|
+
@model_file = options.delete(:model) || nil
|
23
|
+
|
24
|
+
@builder = VectorSpace::Builder.new(:parser => Hoatzin::Parser.new)
|
25
|
+
|
26
|
+
# If we have model and metadata files then load them
|
27
|
+
load if @metadata_file && @model_file
|
28
|
+
|
29
|
+
|
30
|
+
# Define kernel parameters for libsvm
|
31
|
+
@parameters = Parameter.new(:C => 100,
|
32
|
+
:degree => 1,
|
33
|
+
:coef0 => 0,
|
34
|
+
:eps => 0.001)
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
def train classification, text
|
39
|
+
# Only allow retraining if we have all the required data
|
40
|
+
raise ReadOnly if @readonly
|
41
|
+
|
42
|
+
# Add the classification if we haven't seen it before
|
43
|
+
@classifications << classification unless @classifications.include?(classification)
|
44
|
+
|
45
|
+
# Add to document corpus
|
46
|
+
@documents << text
|
47
|
+
|
48
|
+
# Add classification to classification list
|
49
|
+
@labels << @classifications.index(classification)
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
def classify text
|
54
|
+
|
55
|
+
# See if we need to calculate the feature vectors
|
56
|
+
sync
|
57
|
+
|
58
|
+
# Calculate the feature vectors for the text to be classified
|
59
|
+
f_vector = @builder.build_query_vector(text)
|
60
|
+
|
61
|
+
# Classify and return classification
|
62
|
+
pred, probs = @model.predict_probability(f_vector)
|
63
|
+
@classifications[pred.to_i]
|
64
|
+
end
|
65
|
+
|
66
|
+
def sync
|
67
|
+
# Only update the model if we've trained more documents since it was last updated
|
68
|
+
if !@readonly && @documents.length > @cache
|
69
|
+
return nil if @documents.length == 0
|
70
|
+
@cache = @documents.length
|
71
|
+
assign_model
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def save options = {}
|
76
|
+
@metadata_file = options[:metadata] if options.key?(:metadata)
|
77
|
+
@model_file = options[:model] if options.key?(:model)
|
78
|
+
return false unless (@metadata_file && @model_file)
|
79
|
+
|
80
|
+
# TODO: Add a version identifier
|
81
|
+
data = { :classifications => @classifications,
|
82
|
+
:version => FORMAT_VERSION,
|
83
|
+
:dictionary => @builder.vector_keyword_index,
|
84
|
+
:readonly => true }
|
85
|
+
data.merge!(:documents => @documents,
|
86
|
+
:cache => @cache,
|
87
|
+
:readonly => false) if options[:update]
|
88
|
+
File.open(@metadata_file, 'w+') { |f| Marshal.dump(data, f) }
|
89
|
+
assign_model if @model.nil?
|
90
|
+
@model.save(@model_file)
|
91
|
+
end
|
92
|
+
|
93
|
+
protected
|
94
|
+
def load
|
95
|
+
data = {}
|
96
|
+
File.open(@metadata_file) { |f| data = Marshal.load(f) }
|
97
|
+
raise InvalidFormat if !data.key?(:version) || data[:version] != FORMAT_VERSION
|
98
|
+
@classifications = data[:classifications]
|
99
|
+
@readonly = data[:readonly]
|
100
|
+
@builder.vector_keyword_index = data[:dictionary]
|
101
|
+
unless @readonly
|
102
|
+
@documents = data[:documents]
|
103
|
+
@cache = data[:cache]
|
104
|
+
end
|
105
|
+
@model = Model.new(@model_file)
|
106
|
+
end
|
107
|
+
|
108
|
+
def assign_model
|
109
|
+
vector_space_model = @builder.build_document_matrix(@documents)
|
110
|
+
@problem = Problem.new(@labels, vector_space_model.matrix)
|
111
|
+
@model = Model.new(@problem, @parameters)
|
112
|
+
end
|
113
|
+
|
114
|
+
end
|
115
|
+
end
|
data/lib/hoatzin.rb
CHANGED
@@ -1,189 +1,9 @@
|
|
1
1
|
require 'svm'
|
2
2
|
require 'fast_stemmer'
|
3
3
|
require 'iconv'
|
4
|
+
require 'pp'
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
attr_reader :classifications
|
11
|
-
|
12
|
-
def initialize options = {}
|
13
|
-
|
14
|
-
@metadata_file = options.delete(:metadata) || nil
|
15
|
-
@model_file = options.delete(:model) || nil
|
16
|
-
@documents = []
|
17
|
-
@dictionary = []
|
18
|
-
@classifications = []
|
19
|
-
@labels = []
|
20
|
-
@feature_vectors = []
|
21
|
-
@problem = @model = nil
|
22
|
-
@cache = 0
|
23
|
-
@readonly = false
|
24
|
-
|
25
|
-
# If we have model and metadata files then load them
|
26
|
-
load if @metadata_file && @model_file
|
27
|
-
|
28
|
-
# Define kernel parameters for libsvm
|
29
|
-
@parameters = Parameter.new(:C => 100,
|
30
|
-
:degree => 1,
|
31
|
-
:coef0 => 0,
|
32
|
-
:eps => 0.001)
|
33
|
-
|
34
|
-
end
|
35
|
-
|
36
|
-
def train classification, text
|
37
|
-
# Only allow retraining if we have all the required data
|
38
|
-
raise ReadOnly if @readonly
|
39
|
-
|
40
|
-
# Add the classification if we haven't seen it before
|
41
|
-
@classifications << classification unless @classifications.include?(classification)
|
42
|
-
|
43
|
-
# Tokenize the text
|
44
|
-
tokens = Classifier.tokenize(text)
|
45
|
-
|
46
|
-
# Add tokens to word list
|
47
|
-
@dictionary << tokens
|
48
|
-
@dictionary.flatten!.uniq!
|
49
|
-
|
50
|
-
# Add to list of documents
|
51
|
-
@documents << tokens
|
52
|
-
|
53
|
-
# Add classification to classification list
|
54
|
-
@labels << @classifications.index(classification)
|
55
|
-
|
56
|
-
# Compute the feature vectors
|
57
|
-
@feature_vectors = @documents.map { |doc| @dictionary.map{|x| doc.include?(x) ? 1 : 0} }
|
58
|
-
end
|
59
|
-
|
60
|
-
def classify text
|
61
|
-
# Only update the model if we've trained more documents since it was last updated
|
62
|
-
if !@readonly && @documents.length > @cache
|
63
|
-
return nil if @documents.length == 0
|
64
|
-
@cache = @documents.length
|
65
|
-
assign_model
|
66
|
-
end
|
67
|
-
|
68
|
-
# Tokenize the text
|
69
|
-
tokens = Classifier.tokenize(text)
|
70
|
-
|
71
|
-
# Calculate the feature vectors for the text to be classified
|
72
|
-
f_vector = @dictionary.map{|x| tokens.include?(x) ? 1 : 0}
|
73
|
-
|
74
|
-
# Classify and return classification
|
75
|
-
pred, probs = @model.predict_probability(f_vector)
|
76
|
-
@classifications[pred.to_i]
|
77
|
-
end
|
78
|
-
|
79
|
-
def save options = {}
|
80
|
-
@metadata_file = options[:metadata] if options.key?(:metadata)
|
81
|
-
@model_file = options[:model] if options.key?(:model)
|
82
|
-
return false unless (@metadata_file && @model_file)
|
83
|
-
data = { :dictionary => @dictionary, :classifications => @classifications}
|
84
|
-
data.merge!(:documents => @documents,
|
85
|
-
:labels => @labels,
|
86
|
-
:feature_vectors => @feature_vectors,
|
87
|
-
:cache => @cache) if options[:update]
|
88
|
-
File.open(@metadata_file, 'w+') { |f| Marshal.dump(data, f) }
|
89
|
-
assign_model if @model.nil?
|
90
|
-
@model.save(@model_file)
|
91
|
-
end
|
92
|
-
|
93
|
-
protected
|
94
|
-
def load
|
95
|
-
data = {}
|
96
|
-
File.open(@metadata_file) { |f| data = Marshal.load(f) }
|
97
|
-
@dictionary = data[:dictionary]
|
98
|
-
@classifications = data[:classifications]
|
99
|
-
if data.key?(:documents)
|
100
|
-
@documents = data[:documents]
|
101
|
-
@labels = data[:labels]
|
102
|
-
@feature_vectors = data[:feature_vectors]
|
103
|
-
@cache = data[:cache]
|
104
|
-
end
|
105
|
-
@readonly = @documents.length > 0 ? false : true
|
106
|
-
@model = Model.new(@model_file)
|
107
|
-
end
|
108
|
-
|
109
|
-
def assign_model
|
110
|
-
@problem = Problem.new(@labels, @feature_vectors)
|
111
|
-
@model = Model.new(@problem, @parameters)
|
112
|
-
end
|
113
|
-
|
114
|
-
# Adapted from ankusa, to replace with tokenizer gem
|
115
|
-
def self.tokenize text
|
116
|
-
tokens = []
|
117
|
-
# from http://www.jroller.com/obie/tags/unicode
|
118
|
-
converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
|
119
|
-
converter.iconv(text).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
|
120
|
-
text.tr('-', ' ').gsub(/[^\w\s]/," ").split.each do |token|
|
121
|
-
tokens << token if (token.length > 3 && !Classifier.stop_words.include?(token))
|
122
|
-
end
|
123
|
-
tokens
|
124
|
-
end
|
125
|
-
|
126
|
-
# ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
127
|
-
def self.stop_words
|
128
|
-
%w{
|
129
|
-
a a's able about above according accordingly across actually after
|
130
|
-
afterwards again against ain't all allow allows almost alone along
|
131
|
-
already also although always am among amongst an and another any
|
132
|
-
anybody anyhow anyone anything anyway anyways anywhere apart appear
|
133
|
-
appreciate appropriate are aren't around as aside ask asking
|
134
|
-
associated at available away awfully b be became because become
|
135
|
-
becomes becoming been before beforehand behind being believe below
|
136
|
-
beside besides best better between beyond both brief but by c
|
137
|
-
c'mon c's came can can't cannot cant cause causes certain certainly
|
138
|
-
changes clearly co com come comes concerning consequently consider
|
139
|
-
considering contain containing contains corresponding could couldn't
|
140
|
-
course currently d definitely described despite did didn't different
|
141
|
-
do does doesn't doing don't done down downwards during e each edu
|
142
|
-
eg eight either else elsewhere enough entirely especially et etc
|
143
|
-
even ever every everybody everyone everything everywhere ex exactly
|
144
|
-
example except f far few fifth first five followed following follows
|
145
|
-
for former formerly forth four from further furthermore g get gets
|
146
|
-
getting given gives go goes going gone got gotten greetings h had
|
147
|
-
hadn't happens hardly has hasn't have haven't having he he's hello
|
148
|
-
help hence her here here's hereafter hereby herein hereupon hers
|
149
|
-
herself hi him himself his hither hopefully how howbeit however i
|
150
|
-
i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
|
151
|
-
indicate indicated indicates inner insofar instead into inward is
|
152
|
-
isn't it it'd it'll it's its itself j just k keep keeps kept know
|
153
|
-
knows known l last lately later latter latterly least less lest let
|
154
|
-
let's like liked likely little look looking looks ltd m mainly many
|
155
|
-
may maybe me mean meanwhile merely might more moreover most mostly
|
156
|
-
much must my myself n name namely nd near nearly necessary need needs
|
157
|
-
neither never nevertheless new next nine no nobody non none noone
|
158
|
-
nor normally not nothing novel now nowhere o obviously of off often
|
159
|
-
oh ok okay old on once one ones only onto or other others otherwise
|
160
|
-
ought our ours ourselves out outside over overall own p particular
|
161
|
-
particularly per perhaps placed please plus possible presumably
|
162
|
-
probably provides q que quite qv r rather rd re really reasonably
|
163
|
-
regarding regardless regards relatively respectively right s said
|
164
|
-
same saw say saying says second secondly see seeing seem seemed
|
165
|
-
seeming seems seen self selves sensible sent serious seriously
|
166
|
-
seven several shall she should shouldn't since six so some somebody
|
167
|
-
somehow someone something sometime sometimes somewhat somewhere soon
|
168
|
-
sorry specified specify specifying still sub such sup sure t t's
|
169
|
-
take taken tell tends th than thank thanks thanx that that's thats
|
170
|
-
the their theirs them themselves then thence there there's thereafter
|
171
|
-
thereby therefore therein theres thereupon these they they'd they'll
|
172
|
-
they're they've think third this thorough thoroughly those though
|
173
|
-
three through throughout thru thus to together too took toward
|
174
|
-
towards tried tries truly try trying twice two u un under
|
175
|
-
unfortunately unless unlikely until unto up upon us use used useful
|
176
|
-
uses using usually uucp v value various very via viz vs w want wants
|
177
|
-
was wasn't way we we'd we'll we're we've welcome well went were weren't
|
178
|
-
what what's whatever when whence whenever where where's whereafter
|
179
|
-
whereas whereby wherein whereupon wherever whether which while
|
180
|
-
whither who who's whoever whole whom whose why will willing wish
|
181
|
-
with within without won't wonder would would wouldn't x y yes yet
|
182
|
-
you you'd you'll you're you've your yours yourself yourselves
|
183
|
-
z zero
|
184
|
-
}
|
185
|
-
end
|
186
|
-
|
187
|
-
|
188
|
-
end
|
189
|
-
end
|
6
|
+
require 'classifier'
|
7
|
+
require 'parser'
|
8
|
+
require 'vector_space/model'
|
9
|
+
require 'vector_space/builder'
|
data/lib/parser.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
module Hoatzin
|
2
|
+
class Parser
|
3
|
+
|
4
|
+
def initialize
|
5
|
+
end
|
6
|
+
|
7
|
+
# Adapted from ankusa, to replace with tokenizer gem
|
8
|
+
def tokenize text
|
9
|
+
tokens = []
|
10
|
+
# from http://www.jroller.com/obie/tags/unicode
|
11
|
+
converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
|
12
|
+
converter.iconv(text).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
|
13
|
+
text.tr('-', ' ').gsub(/[^\w\s]/," ").split.each do |token|
|
14
|
+
token = token.stem
|
15
|
+
tokens << token if (token.length > 3 && !stop_words.include?(token))
|
16
|
+
end
|
17
|
+
tokens
|
18
|
+
end
|
19
|
+
|
20
|
+
# ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
21
|
+
def stop_words
|
22
|
+
%w{
|
23
|
+
a a's able about above according accordingly across actually after
|
24
|
+
afterwards again against ain't all allow allows almost alone along
|
25
|
+
already also although always am among amongst an and another any
|
26
|
+
anybody anyhow anyone anything anyway anyways anywhere apart appear
|
27
|
+
appreciate appropriate are aren't around as aside ask asking
|
28
|
+
associated at available away awfully b be became because become
|
29
|
+
becomes becoming been before beforehand behind being believe below
|
30
|
+
beside besides best better between beyond both brief but by c
|
31
|
+
c'mon c's came can can't cannot cant cause causes certain certainly
|
32
|
+
changes clearly co com come comes concerning consequently consider
|
33
|
+
considering contain containing contains corresponding could couldn't
|
34
|
+
course currently d definitely described despite did didn't different
|
35
|
+
do does doesn't doing don't done down downwards during e each edu
|
36
|
+
eg eight either else elsewhere enough entirely especially et etc
|
37
|
+
even ever every everybody everyone everything everywhere ex exactly
|
38
|
+
example except f far few fifth first five followed following follows
|
39
|
+
for former formerly forth four from further furthermore g get gets
|
40
|
+
getting given gives go goes going gone got gotten greetings h had
|
41
|
+
hadn't happens hardly has hasn't have haven't having he he's hello
|
42
|
+
help hence her here here's hereafter hereby herein hereupon hers
|
43
|
+
herself hi him himself his hither hopefully how howbeit however i
|
44
|
+
i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
|
45
|
+
indicate indicated indicates inner insofar instead into inward is
|
46
|
+
isn't it it'd it'll it's its itself j just k keep keeps kept know
|
47
|
+
knows known l last lately later latter latterly least less lest let
|
48
|
+
let's like liked likely little look looking looks ltd m mainly many
|
49
|
+
may maybe me mean meanwhile merely might more moreover most mostly
|
50
|
+
much must my myself n name namely nd near nearly necessary need needs
|
51
|
+
neither never nevertheless new next nine no nobody non none noone
|
52
|
+
nor normally not nothing novel now nowhere o obviously of off often
|
53
|
+
oh ok okay old on once one ones only onto or other others otherwise
|
54
|
+
ought our ours ourselves out outside over overall own p particular
|
55
|
+
particularly per perhaps placed please plus possible presumably
|
56
|
+
probably provides q que quite qv r rather rd re really reasonably
|
57
|
+
regarding regardless regards relatively respectively right s said
|
58
|
+
same saw say saying says second secondly see seeing seem seemed
|
59
|
+
seeming seems seen self selves sensible sent serious seriously
|
60
|
+
seven several shall she should shouldn't since six so some somebody
|
61
|
+
somehow someone something sometime sometimes somewhat somewhere soon
|
62
|
+
sorry specified specify specifying still sub such sup sure t t's
|
63
|
+
take taken tell tends th than thank thanks thanx that that's thats
|
64
|
+
the their theirs them themselves then thence there there's thereafter
|
65
|
+
thereby therefore therein theres thereupon these they they'd they'll
|
66
|
+
they're they've think third this thorough thoroughly those though
|
67
|
+
three through throughout thru thus to together too took toward
|
68
|
+
towards tried tries truly try trying twice two u un under
|
69
|
+
unfortunately unless unlikely until unto up upon us use used useful
|
70
|
+
uses using usually uucp v value various very via viz vs w want wants
|
71
|
+
was wasn't way we we'd we'll we're we've welcome well went were weren't
|
72
|
+
what what's whatever when whence whenever where where's whereafter
|
73
|
+
whereas whereby wherein whereupon wherever whether which while
|
74
|
+
whither who who's whoever whole whom whose why will willing wish
|
75
|
+
with within without won't wonder would would wouldn't x y yes yet
|
76
|
+
you you'd you'll you're you've your yours yourself yourselves
|
77
|
+
z zero
|
78
|
+
}
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
82
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Adapted from : https://github.com/josephwilk/rsemantic
|
2
|
+
|
3
|
+
module Hoatzin
|
4
|
+
module VectorSpace
|
5
|
+
#An algebraic model for representing text documents as vectors of identifiers.
|
6
|
+
#A document is represented as a vector. Each dimension of the vector corresponds to a
|
7
|
+
#separate term. If a term occurs in the document, then the value in the vector is non-zero.
|
8
|
+
class Builder
|
9
|
+
|
10
|
+
attr_accessor :vector_keyword_index
|
11
|
+
|
12
|
+
def initialize(options={})
|
13
|
+
@parser = options.delete(:parser)
|
14
|
+
@options = options
|
15
|
+
@parsed_document_cache = []
|
16
|
+
end
|
17
|
+
|
18
|
+
def build_document_matrix(documents)
|
19
|
+
@vector_keyword_index = build_vector_keyword_index(documents)
|
20
|
+
|
21
|
+
document_matrix = []
|
22
|
+
document_matrix += documents.enum_for(:each_with_index).map{|document,document_id| build_vector(document, document_id)}
|
23
|
+
|
24
|
+
Model.new(document_matrix, @vector_keyword_index)
|
25
|
+
end
|
26
|
+
|
27
|
+
def build_query_vector(text)
|
28
|
+
build_vector(text)
|
29
|
+
end
|
30
|
+
|
31
|
+
def marshal_dump
|
32
|
+
[@parser, @options, @parsed_document_cache, @vector_keyword_index]
|
33
|
+
end
|
34
|
+
|
35
|
+
def marshal_load(ary)
|
36
|
+
@parser, @options, @parsed_document_cache, @vector_keyword_index = ary
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
def build_vector_keyword_index(documents)
|
41
|
+
parse_and_cache(documents)
|
42
|
+
vocabulary_list = find_unique_vocabulary
|
43
|
+
map_vocabulary_to_vector_positions(vocabulary_list)
|
44
|
+
end
|
45
|
+
|
46
|
+
def parse_and_cache(documents)
|
47
|
+
documents.each_with_index do |document, index|
|
48
|
+
@parsed_document_cache[index] = @parser.tokenize(document)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def find_unique_vocabulary
|
53
|
+
vocabulary_list = @parsed_document_cache.inject([]) { |parsed_document, vocabulary_list| vocabulary_list + parsed_document }
|
54
|
+
vocabulary_list.uniq
|
55
|
+
end
|
56
|
+
|
57
|
+
def map_vocabulary_to_vector_positions(vocabulary_list)
|
58
|
+
vector_index={}
|
59
|
+
column = 0
|
60
|
+
vocabulary_list.each do |word|
|
61
|
+
vector_index[word] = column
|
62
|
+
column += 1
|
63
|
+
end
|
64
|
+
vector_index
|
65
|
+
end
|
66
|
+
|
67
|
+
def build_vector(word_string, document_id=nil)
|
68
|
+
if document_id.nil?
|
69
|
+
word_list = @parser.tokenize(word_string)
|
70
|
+
else
|
71
|
+
word_list = @parsed_document_cache[document_id]
|
72
|
+
end
|
73
|
+
|
74
|
+
vector = Array.new(@vector_keyword_index.length, 0)
|
75
|
+
word_list.each { |word| vector[@vector_keyword_index[word]] += 1 if @vector_keyword_index.has_key?(word) }
|
76
|
+
vector
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Adapted from : https://github.com/josephwilk/rsemantic
|
2
|
+
|
3
|
+
#require 'stringio'
|
4
|
+
|
5
|
+
module Hoatzin
|
6
|
+
module VectorSpace
|
7
|
+
|
8
|
+
class Model
|
9
|
+
|
10
|
+
def initialize(matrix, keywords)
|
11
|
+
@keywords = keywords || {}
|
12
|
+
@_dc_obj = matrix
|
13
|
+
end
|
14
|
+
|
15
|
+
def matrix=(matrix)
|
16
|
+
@_dc_obj = matrix
|
17
|
+
end
|
18
|
+
|
19
|
+
def matrix
|
20
|
+
@_dc_obj
|
21
|
+
end
|
22
|
+
|
23
|
+
# def to_s
|
24
|
+
# out = StringIO.new
|
25
|
+
# out.print " " * 9
|
26
|
+
#
|
27
|
+
# matrix.ncol.times do |id|
|
28
|
+
# out.print " D#{id+1} "
|
29
|
+
# end
|
30
|
+
# out.puts
|
31
|
+
#
|
32
|
+
# matrix.rows.each_with_index do |terms, index|
|
33
|
+
# out.print "#{@keywords.index(index).ljust(6)}" if @keywords.has_value?(index)
|
34
|
+
# out.print "[ "
|
35
|
+
# terms.columns.each do |document|
|
36
|
+
# out.print "%+0.2f " % document
|
37
|
+
# end
|
38
|
+
# out.print "]"
|
39
|
+
# out.puts
|
40
|
+
# end
|
41
|
+
# out.string
|
42
|
+
# end
|
43
|
+
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -1,16 +1,16 @@
|
|
1
1
|
svm_type c_svc
|
2
2
|
kernel_type rbf
|
3
|
-
gamma 0.
|
3
|
+
gamma 0.0833333
|
4
4
|
nr_class 2
|
5
5
|
total_sv 7
|
6
|
-
rho -0.
|
6
|
+
rho -0.44634
|
7
7
|
label 0 1
|
8
8
|
nr_sv 4 3
|
9
9
|
SV
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
-
|
15
|
-
-0.
|
16
|
-
-
|
10
|
+
0.09624983775649808 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:1 9:0 10:0 11:1
|
11
|
+
1.421056914116459 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:0 9:1 10:1 11:0
|
12
|
+
3.139840844768948 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:0 10:0 11:0
|
13
|
+
2.541431764584626 0:0 1:0 2:0 3:0 4:0 5:1 6:1 7:0 8:0 9:0 10:0 11:0
|
14
|
+
-0.1202843540395405 0:1 1:0 2:0 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
15
|
+
-0.7832988164395933 0:1 1:1 2:1 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
16
|
+
-6.294996190747396 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
Binary file
|
data/test/models/test/metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
svm_type c_svc
|
2
2
|
kernel_type rbf
|
3
|
-
gamma 0.
|
3
|
+
gamma 0.0833333
|
4
4
|
nr_class 2
|
5
5
|
total_sv 7
|
6
|
-
rho -0.
|
6
|
+
rho -0.44634
|
7
7
|
label 0 1
|
8
8
|
nr_sv 4 3
|
9
9
|
SV
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
-
|
15
|
-
-0.
|
16
|
-
-
|
10
|
+
0.09624983775649808 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:1 9:0 10:0 11:1
|
11
|
+
1.421056914116459 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:0 9:1 10:1 11:0
|
12
|
+
3.139840844768948 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:0 10:0 11:0
|
13
|
+
2.541431764584626 0:0 1:0 2:0 3:0 4:0 5:1 6:1 7:0 8:0 9:0 10:0 11:0
|
14
|
+
-0.1202843540395405 0:1 1:0 2:0 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
15
|
+
-0.7832988164395933 0:1 1:1 2:1 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
16
|
+
-6.294996190747396 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
data/test/models/test/model
CHANGED
Binary file
|
data/test/test_hoatzin.rb
CHANGED
@@ -9,7 +9,7 @@ class TestHoatzin < Test::Unit::TestCase
|
|
9
9
|
end
|
10
10
|
|
11
11
|
should "support training and classification" do
|
12
|
-
assert_equal @c.train(:positive, "Thats nice"), [[1, 1]]
|
12
|
+
assert_equal @c.train(:positive, "Thats nice"), [0] #[[1, 1]]
|
13
13
|
assert_equal @c.classify("Thats nice"), :positive
|
14
14
|
end
|
15
15
|
|
@@ -22,10 +22,10 @@ class TestHoatzin < Test::Unit::TestCase
|
|
22
22
|
end
|
23
23
|
|
24
24
|
should "classify the test set correctly" do
|
25
|
-
#@c.save(:metadata => METADATA_FILE, :model => MODEL_FILE, :update => true)
|
26
25
|
TESTING_LABELS.each_with_index do |label, index|
|
27
26
|
assert_equal @c.classify(TESTING_DOCS[index]), label
|
28
27
|
end
|
28
|
+
#@c.save(:metadata => READONLY_METADATA_FILE, :model => READONLY_MODEL_FILE, :update => false)
|
29
29
|
end
|
30
30
|
|
31
31
|
should "return the classifications" do
|
@@ -71,4 +71,5 @@ class TestHoatzin < Test::Unit::TestCase
|
|
71
71
|
end
|
72
72
|
|
73
73
|
end
|
74
|
+
|
74
75
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hoatzin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
+
- 2
|
8
9
|
- 0
|
9
|
-
version: 0.
|
10
|
+
version: 0.2.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- robl
|
@@ -14,91 +15,97 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date:
|
18
|
+
date: 2011-01-03 00:00:00 +00:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
22
|
+
prerelease: false
|
21
23
|
name: libsvm-ruby-swig
|
22
|
-
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
23
25
|
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
27
30
|
segments:
|
28
31
|
- 0
|
29
32
|
version: "0"
|
33
|
+
requirement: *id001
|
30
34
|
type: :runtime
|
31
|
-
prerelease: false
|
32
|
-
version_requirements: *id001
|
33
35
|
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
34
37
|
name: fast-stemmer
|
35
|
-
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
36
39
|
none: false
|
37
40
|
requirements:
|
38
41
|
- - ">="
|
39
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
40
44
|
segments:
|
41
45
|
- 0
|
42
46
|
version: "0"
|
47
|
+
requirement: *id002
|
43
48
|
type: :runtime
|
44
|
-
prerelease: false
|
45
|
-
version_requirements: *id002
|
46
49
|
- !ruby/object:Gem::Dependency
|
50
|
+
prerelease: false
|
47
51
|
name: shoulda
|
48
|
-
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
49
53
|
none: false
|
50
54
|
requirements:
|
51
55
|
- - ">="
|
52
56
|
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
53
58
|
segments:
|
54
59
|
- 0
|
55
60
|
version: "0"
|
61
|
+
requirement: *id003
|
56
62
|
type: :development
|
57
|
-
prerelease: false
|
58
|
-
version_requirements: *id003
|
59
63
|
- !ruby/object:Gem::Dependency
|
64
|
+
prerelease: false
|
60
65
|
name: bundler
|
61
|
-
|
66
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
62
67
|
none: false
|
63
68
|
requirements:
|
64
69
|
- - ~>
|
65
70
|
- !ruby/object:Gem::Version
|
71
|
+
hash: 23
|
66
72
|
segments:
|
67
73
|
- 1
|
68
74
|
- 0
|
69
75
|
- 0
|
70
76
|
version: 1.0.0
|
77
|
+
requirement: *id004
|
71
78
|
type: :development
|
72
|
-
prerelease: false
|
73
|
-
version_requirements: *id004
|
74
79
|
- !ruby/object:Gem::Dependency
|
80
|
+
prerelease: false
|
75
81
|
name: jeweler
|
76
|
-
|
82
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
77
83
|
none: false
|
78
84
|
requirements:
|
79
85
|
- - ~>
|
80
86
|
- !ruby/object:Gem::Version
|
87
|
+
hash: 7
|
81
88
|
segments:
|
82
89
|
- 1
|
83
90
|
- 5
|
84
91
|
- 2
|
85
92
|
version: 1.5.2
|
93
|
+
requirement: *id005
|
86
94
|
type: :development
|
87
|
-
prerelease: false
|
88
|
-
version_requirements: *id005
|
89
95
|
- !ruby/object:Gem::Dependency
|
96
|
+
prerelease: false
|
90
97
|
name: rcov
|
91
|
-
|
98
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
92
99
|
none: false
|
93
100
|
requirements:
|
94
101
|
- - ">="
|
95
102
|
- !ruby/object:Gem::Version
|
103
|
+
hash: 3
|
96
104
|
segments:
|
97
105
|
- 0
|
98
106
|
version: "0"
|
107
|
+
requirement: *id006
|
99
108
|
type: :development
|
100
|
-
prerelease: false
|
101
|
-
version_requirements: *id006
|
102
109
|
description: Hoatzin is a text classifier in Ruby that uses SVM for it's classification.
|
103
110
|
email: robl@rjlee.net
|
104
111
|
executables: []
|
@@ -116,7 +123,11 @@ files:
|
|
116
123
|
- Rakefile
|
117
124
|
- VERSION
|
118
125
|
- hoatzin.gemspec
|
126
|
+
- lib/classifier.rb
|
119
127
|
- lib/hoatzin.rb
|
128
|
+
- lib/parser.rb
|
129
|
+
- lib/vector_space/builder.rb
|
130
|
+
- lib/vector_space/model.rb
|
120
131
|
- test/helper.rb
|
121
132
|
- test/models/readonly-test/metadata
|
122
133
|
- test/models/readonly-test/model
|
@@ -137,7 +148,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
137
148
|
requirements:
|
138
149
|
- - ">="
|
139
150
|
- !ruby/object:Gem::Version
|
140
|
-
hash:
|
151
|
+
hash: 3
|
141
152
|
segments:
|
142
153
|
- 0
|
143
154
|
version: "0"
|
@@ -146,6 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
157
|
requirements:
|
147
158
|
- - ">="
|
148
159
|
- !ruby/object:Gem::Version
|
160
|
+
hash: 3
|
149
161
|
segments:
|
150
162
|
- 0
|
151
163
|
version: "0"
|