hoatzin 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +6 -0
- data/README.markdown +38 -4
- data/VERSION +1 -1
- data/hoatzin.gemspec +6 -2
- data/lib/classifier.rb +115 -0
- data/lib/hoatzin.rb +5 -185
- data/lib/parser.rb +82 -0
- data/lib/vector_space/builder.rb +81 -0
- data/lib/vector_space/model.rb +46 -0
- data/test/models/readonly-test/metadata +9 -9
- data/test/models/readonly-test/model +0 -0
- data/test/models/test/metadata +9 -9
- data/test/models/test/model +0 -0
- data/test/test_hoatzin.rb +3 -2
- metadata +34 -22
data/LICENSE.txt
CHANGED
@@ -18,3 +18,9 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
18
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
19
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
20
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
22
|
+
Portions of this software are licensed under the MIT License, as specified in
|
23
|
+
their original form (from https://github.com/josephwilk/rsemantic) :
|
24
|
+
|
25
|
+
vector_space/builder.rb
|
26
|
+
vector_space/model.rb
|
data/README.markdown
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# hoatzin
|
2
2
|
|
3
|
-
Hoatzin is a text classifier in Ruby that uses
|
3
|
+
Hoatzin is a text classifier in Ruby that uses libsvm for its classification.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -19,8 +19,8 @@ gem install hoatzin
|
|
19
19
|
|
20
20
|
## Storage
|
21
21
|
|
22
|
-
The Hoatzin classifier supports saving your trained classifier to the filesystem. It
|
23
|
-
|
22
|
+
The Hoatzin classifier supports saving your trained classifier to the filesystem. It stores
|
23
|
+
the generated libsvm model and the required metadata as two separate files.
|
24
24
|
|
25
25
|
# Load a previously trained classifier
|
26
26
|
c = Hoatzin::Classifier.new(:metadata => '/path/to/file', :model => '/path/to/file')
|
@@ -36,11 +36,45 @@ to store the libsvm model and the associated metadata as two separate files.
|
|
36
36
|
The classifier can continue to be trained if the model is saved with the :update => true option,
|
37
37
|
however the files stored on the filesystem will be much larger as they will contain copies
|
38
38
|
of all the documents used during training the classifier. It is generally advised to save without
|
39
|
-
the :update => true option unless it is
|
39
|
+
the :update => true option unless it is required.
|
40
|
+
|
41
|
+
## Training
|
42
|
+
|
43
|
+
The #train method doesn't calculate all the required information for classification
|
44
|
+
(in particular the feature vectors) due to the time they take to recompute for each new
|
45
|
+
token generated when adding a document for training. This means that there can be a delay
|
46
|
+
when calling the #classify method for the first time whilst all the required information
|
47
|
+
is prepared. This preparation step can be explicitly called using the #sync method. This
|
48
|
+
method is transparently called by the #classify method when required. Sample usage of the #sync
|
49
|
+
method is shown below:
|
50
|
+
|
51
|
+
# Create a hoatzin classifier
|
52
|
+
c = Hoatzin::Classifier.new()
|
53
|
+
|
54
|
+
# Add the training data to the classifier
|
55
|
+
corpus.each do |doc|
|
56
|
+
c.train(doc[:classification], doc[:text])
|
57
|
+
end
|
58
|
+
|
59
|
+
# Force the calculation of the feature vectors and
|
60
|
+
# preparation of the SVM model. This can take some
|
61
|
+
# time if the corpus is large
|
62
|
+
c.sync
|
63
|
+
|
64
|
+
# Save the model and associated meta-data so we don't have to
|
65
|
+
# call sync again and wait for the feature vectors to be computed
|
66
|
+
c.save(:metadata => '/path/to/metadata', :model => '/path/to/model')
|
67
|
+
|
68
|
+
# Now call classify
|
69
|
+
c.classify("Spectacular show")
|
70
|
+
|
71
|
+
The saved model and metadata can be loaded again for classification, avoiding the
|
72
|
+
need to recompute the feature vectors.
|
40
73
|
|
41
74
|
## Acknowledgements
|
42
75
|
|
43
76
|
See http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/ for the original inspiration.
|
77
|
+
The Vector Space Model implementation is adapted from https://github.com/josephwilk/rsemantic
|
44
78
|
|
45
79
|
## Copyright and License
|
46
80
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/hoatzin.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{hoatzin}
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["robl"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2011-01-03}
|
13
13
|
s.description = %q{Hoatzin is a text classifier in Ruby that uses SVM for it's classification.}
|
14
14
|
s.email = %q{robl@rjlee.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -24,7 +24,11 @@ Gem::Specification.new do |s|
|
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
26
|
"hoatzin.gemspec",
|
27
|
+
"lib/classifier.rb",
|
27
28
|
"lib/hoatzin.rb",
|
29
|
+
"lib/parser.rb",
|
30
|
+
"lib/vector_space/builder.rb",
|
31
|
+
"lib/vector_space/model.rb",
|
28
32
|
"test/helper.rb",
|
29
33
|
"test/models/readonly-test/metadata",
|
30
34
|
"test/models/readonly-test/model",
|
data/lib/classifier.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
module Hoatzin
|
2
|
+
class Classifier
|
3
|
+
|
4
|
+
class ReadOnly < Exception; end
|
5
|
+
class InvalidFormat < Exception; end
|
6
|
+
|
7
|
+
FORMAT_VERSION = 2
|
8
|
+
|
9
|
+
attr_reader :classifications
|
10
|
+
|
11
|
+
def initialize options = {}
|
12
|
+
|
13
|
+
@documents = []
|
14
|
+
@classifications = []
|
15
|
+
@labels = []
|
16
|
+
|
17
|
+
@problem = @model = nil
|
18
|
+
@cache = 0
|
19
|
+
@readonly = false
|
20
|
+
|
21
|
+
@metadata_file = options.delete(:metadata) || nil
|
22
|
+
@model_file = options.delete(:model) || nil
|
23
|
+
|
24
|
+
@builder = VectorSpace::Builder.new(:parser => Hoatzin::Parser.new)
|
25
|
+
|
26
|
+
# If we have model and metadata files then load them
|
27
|
+
load if @metadata_file && @model_file
|
28
|
+
|
29
|
+
|
30
|
+
# Define kernel parameters for libsvm
|
31
|
+
@parameters = Parameter.new(:C => 100,
|
32
|
+
:degree => 1,
|
33
|
+
:coef0 => 0,
|
34
|
+
:eps => 0.001)
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
def train classification, text
|
39
|
+
# Only allow retraining if we have all the required data
|
40
|
+
raise ReadOnly if @readonly
|
41
|
+
|
42
|
+
# Add the classification if we haven't seen it before
|
43
|
+
@classifications << classification unless @classifications.include?(classification)
|
44
|
+
|
45
|
+
# Add to document corpus
|
46
|
+
@documents << text
|
47
|
+
|
48
|
+
# Add classification to classification list
|
49
|
+
@labels << @classifications.index(classification)
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
def classify text
|
54
|
+
|
55
|
+
# See if we need to calculate the feature vectors
|
56
|
+
sync
|
57
|
+
|
58
|
+
# Calculate the feature vectors for the text to be classified
|
59
|
+
f_vector = @builder.build_query_vector(text)
|
60
|
+
|
61
|
+
# Classify and return classification
|
62
|
+
pred, probs = @model.predict_probability(f_vector)
|
63
|
+
@classifications[pred.to_i]
|
64
|
+
end
|
65
|
+
|
66
|
+
def sync
|
67
|
+
# Only update the model if we've trained more documents since it was last updated
|
68
|
+
if !@readonly && @documents.length > @cache
|
69
|
+
return nil if @documents.length == 0
|
70
|
+
@cache = @documents.length
|
71
|
+
assign_model
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def save options = {}
|
76
|
+
@metadata_file = options[:metadata] if options.key?(:metadata)
|
77
|
+
@model_file = options[:model] if options.key?(:model)
|
78
|
+
return false unless (@metadata_file && @model_file)
|
79
|
+
|
80
|
+
# TODO: Add a version identifier
|
81
|
+
data = { :classifications => @classifications,
|
82
|
+
:version => FORMAT_VERSION,
|
83
|
+
:dictionary => @builder.vector_keyword_index,
|
84
|
+
:readonly => true }
|
85
|
+
data.merge!(:documents => @documents,
|
86
|
+
:cache => @cache,
|
87
|
+
:readonly => false) if options[:update]
|
88
|
+
File.open(@metadata_file, 'w+') { |f| Marshal.dump(data, f) }
|
89
|
+
assign_model if @model.nil?
|
90
|
+
@model.save(@model_file)
|
91
|
+
end
|
92
|
+
|
93
|
+
protected
|
94
|
+
def load
|
95
|
+
data = {}
|
96
|
+
File.open(@metadata_file) { |f| data = Marshal.load(f) }
|
97
|
+
raise InvalidFormat if !data.key?(:version) || data[:version] != FORMAT_VERSION
|
98
|
+
@classifications = data[:classifications]
|
99
|
+
@readonly = data[:readonly]
|
100
|
+
@builder.vector_keyword_index = data[:dictionary]
|
101
|
+
unless @readonly
|
102
|
+
@documents = data[:documents]
|
103
|
+
@cache = data[:cache]
|
104
|
+
end
|
105
|
+
@model = Model.new(@model_file)
|
106
|
+
end
|
107
|
+
|
108
|
+
def assign_model
|
109
|
+
vector_space_model = @builder.build_document_matrix(@documents)
|
110
|
+
@problem = Problem.new(@labels, vector_space_model.matrix)
|
111
|
+
@model = Model.new(@problem, @parameters)
|
112
|
+
end
|
113
|
+
|
114
|
+
end
|
115
|
+
end
|
data/lib/hoatzin.rb
CHANGED
@@ -1,189 +1,9 @@
|
|
1
1
|
require 'svm'
|
2
2
|
require 'fast_stemmer'
|
3
3
|
require 'iconv'
|
4
|
+
require 'pp'
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
attr_reader :classifications
|
11
|
-
|
12
|
-
def initialize options = {}
|
13
|
-
|
14
|
-
@metadata_file = options.delete(:metadata) || nil
|
15
|
-
@model_file = options.delete(:model) || nil
|
16
|
-
@documents = []
|
17
|
-
@dictionary = []
|
18
|
-
@classifications = []
|
19
|
-
@labels = []
|
20
|
-
@feature_vectors = []
|
21
|
-
@problem = @model = nil
|
22
|
-
@cache = 0
|
23
|
-
@readonly = false
|
24
|
-
|
25
|
-
# If we have model and metadata files then load them
|
26
|
-
load if @metadata_file && @model_file
|
27
|
-
|
28
|
-
# Define kernel parameters for libsvm
|
29
|
-
@parameters = Parameter.new(:C => 100,
|
30
|
-
:degree => 1,
|
31
|
-
:coef0 => 0,
|
32
|
-
:eps => 0.001)
|
33
|
-
|
34
|
-
end
|
35
|
-
|
36
|
-
def train classification, text
|
37
|
-
# Only allow retraining if we have all the required data
|
38
|
-
raise ReadOnly if @readonly
|
39
|
-
|
40
|
-
# Add the classification if we haven't seen it before
|
41
|
-
@classifications << classification unless @classifications.include?(classification)
|
42
|
-
|
43
|
-
# Tokenize the text
|
44
|
-
tokens = Classifier.tokenize(text)
|
45
|
-
|
46
|
-
# Add tokens to word list
|
47
|
-
@dictionary << tokens
|
48
|
-
@dictionary.flatten!.uniq!
|
49
|
-
|
50
|
-
# Add to list of documents
|
51
|
-
@documents << tokens
|
52
|
-
|
53
|
-
# Add classification to classification list
|
54
|
-
@labels << @classifications.index(classification)
|
55
|
-
|
56
|
-
# Compute the feature vectors
|
57
|
-
@feature_vectors = @documents.map { |doc| @dictionary.map{|x| doc.include?(x) ? 1 : 0} }
|
58
|
-
end
|
59
|
-
|
60
|
-
def classify text
|
61
|
-
# Only update the model if we've trained more documents since it was last updated
|
62
|
-
if !@readonly && @documents.length > @cache
|
63
|
-
return nil if @documents.length == 0
|
64
|
-
@cache = @documents.length
|
65
|
-
assign_model
|
66
|
-
end
|
67
|
-
|
68
|
-
# Tokenize the text
|
69
|
-
tokens = Classifier.tokenize(text)
|
70
|
-
|
71
|
-
# Calculate the feature vectors for the text to be classified
|
72
|
-
f_vector = @dictionary.map{|x| tokens.include?(x) ? 1 : 0}
|
73
|
-
|
74
|
-
# Classify and return classification
|
75
|
-
pred, probs = @model.predict_probability(f_vector)
|
76
|
-
@classifications[pred.to_i]
|
77
|
-
end
|
78
|
-
|
79
|
-
def save options = {}
|
80
|
-
@metadata_file = options[:metadata] if options.key?(:metadata)
|
81
|
-
@model_file = options[:model] if options.key?(:model)
|
82
|
-
return false unless (@metadata_file && @model_file)
|
83
|
-
data = { :dictionary => @dictionary, :classifications => @classifications}
|
84
|
-
data.merge!(:documents => @documents,
|
85
|
-
:labels => @labels,
|
86
|
-
:feature_vectors => @feature_vectors,
|
87
|
-
:cache => @cache) if options[:update]
|
88
|
-
File.open(@metadata_file, 'w+') { |f| Marshal.dump(data, f) }
|
89
|
-
assign_model if @model.nil?
|
90
|
-
@model.save(@model_file)
|
91
|
-
end
|
92
|
-
|
93
|
-
protected
|
94
|
-
def load
|
95
|
-
data = {}
|
96
|
-
File.open(@metadata_file) { |f| data = Marshal.load(f) }
|
97
|
-
@dictionary = data[:dictionary]
|
98
|
-
@classifications = data[:classifications]
|
99
|
-
if data.key?(:documents)
|
100
|
-
@documents = data[:documents]
|
101
|
-
@labels = data[:labels]
|
102
|
-
@feature_vectors = data[:feature_vectors]
|
103
|
-
@cache = data[:cache]
|
104
|
-
end
|
105
|
-
@readonly = @documents.length > 0 ? false : true
|
106
|
-
@model = Model.new(@model_file)
|
107
|
-
end
|
108
|
-
|
109
|
-
def assign_model
|
110
|
-
@problem = Problem.new(@labels, @feature_vectors)
|
111
|
-
@model = Model.new(@problem, @parameters)
|
112
|
-
end
|
113
|
-
|
114
|
-
# Adapted from ankusa, to replace with tokenizer gem
|
115
|
-
def self.tokenize text
|
116
|
-
tokens = []
|
117
|
-
# from http://www.jroller.com/obie/tags/unicode
|
118
|
-
converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
|
119
|
-
converter.iconv(text).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
|
120
|
-
text.tr('-', ' ').gsub(/[^\w\s]/," ").split.each do |token|
|
121
|
-
tokens << token if (token.length > 3 && !Classifier.stop_words.include?(token))
|
122
|
-
end
|
123
|
-
tokens
|
124
|
-
end
|
125
|
-
|
126
|
-
# ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
127
|
-
def self.stop_words
|
128
|
-
%w{
|
129
|
-
a a's able about above according accordingly across actually after
|
130
|
-
afterwards again against ain't all allow allows almost alone along
|
131
|
-
already also although always am among amongst an and another any
|
132
|
-
anybody anyhow anyone anything anyway anyways anywhere apart appear
|
133
|
-
appreciate appropriate are aren't around as aside ask asking
|
134
|
-
associated at available away awfully b be became because become
|
135
|
-
becomes becoming been before beforehand behind being believe below
|
136
|
-
beside besides best better between beyond both brief but by c
|
137
|
-
c'mon c's came can can't cannot cant cause causes certain certainly
|
138
|
-
changes clearly co com come comes concerning consequently consider
|
139
|
-
considering contain containing contains corresponding could couldn't
|
140
|
-
course currently d definitely described despite did didn't different
|
141
|
-
do does doesn't doing don't done down downwards during e each edu
|
142
|
-
eg eight either else elsewhere enough entirely especially et etc
|
143
|
-
even ever every everybody everyone everything everywhere ex exactly
|
144
|
-
example except f far few fifth first five followed following follows
|
145
|
-
for former formerly forth four from further furthermore g get gets
|
146
|
-
getting given gives go goes going gone got gotten greetings h had
|
147
|
-
hadn't happens hardly has hasn't have haven't having he he's hello
|
148
|
-
help hence her here here's hereafter hereby herein hereupon hers
|
149
|
-
herself hi him himself his hither hopefully how howbeit however i
|
150
|
-
i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
|
151
|
-
indicate indicated indicates inner insofar instead into inward is
|
152
|
-
isn't it it'd it'll it's its itself j just k keep keeps kept know
|
153
|
-
knows known l last lately later latter latterly least less lest let
|
154
|
-
let's like liked likely little look looking looks ltd m mainly many
|
155
|
-
may maybe me mean meanwhile merely might more moreover most mostly
|
156
|
-
much must my myself n name namely nd near nearly necessary need needs
|
157
|
-
neither never nevertheless new next nine no nobody non none noone
|
158
|
-
nor normally not nothing novel now nowhere o obviously of off often
|
159
|
-
oh ok okay old on once one ones only onto or other others otherwise
|
160
|
-
ought our ours ourselves out outside over overall own p particular
|
161
|
-
particularly per perhaps placed please plus possible presumably
|
162
|
-
probably provides q que quite qv r rather rd re really reasonably
|
163
|
-
regarding regardless regards relatively respectively right s said
|
164
|
-
same saw say saying says second secondly see seeing seem seemed
|
165
|
-
seeming seems seen self selves sensible sent serious seriously
|
166
|
-
seven several shall she should shouldn't since six so some somebody
|
167
|
-
somehow someone something sometime sometimes somewhat somewhere soon
|
168
|
-
sorry specified specify specifying still sub such sup sure t t's
|
169
|
-
take taken tell tends th than thank thanks thanx that that's thats
|
170
|
-
the their theirs them themselves then thence there there's thereafter
|
171
|
-
thereby therefore therein theres thereupon these they they'd they'll
|
172
|
-
they're they've think third this thorough thoroughly those though
|
173
|
-
three through throughout thru thus to together too took toward
|
174
|
-
towards tried tries truly try trying twice two u un under
|
175
|
-
unfortunately unless unlikely until unto up upon us use used useful
|
176
|
-
uses using usually uucp v value various very via viz vs w want wants
|
177
|
-
was wasn't way we we'd we'll we're we've welcome well went were weren't
|
178
|
-
what what's whatever when whence whenever where where's whereafter
|
179
|
-
whereas whereby wherein whereupon wherever whether which while
|
180
|
-
whither who who's whoever whole whom whose why will willing wish
|
181
|
-
with within without won't wonder would would wouldn't x y yes yet
|
182
|
-
you you'd you'll you're you've your yours yourself yourselves
|
183
|
-
z zero
|
184
|
-
}
|
185
|
-
end
|
186
|
-
|
187
|
-
|
188
|
-
end
|
189
|
-
end
|
6
|
+
require 'classifier'
|
7
|
+
require 'parser'
|
8
|
+
require 'vector_space/model'
|
9
|
+
require 'vector_space/builder'
|
data/lib/parser.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
module Hoatzin
|
2
|
+
class Parser
|
3
|
+
|
4
|
+
def initialize
|
5
|
+
end
|
6
|
+
|
7
|
+
# Adapted from ankusa, to replace with tokenizer gem
|
8
|
+
def tokenize text
|
9
|
+
tokens = []
|
10
|
+
# from http://www.jroller.com/obie/tags/unicode
|
11
|
+
converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
|
12
|
+
converter.iconv(text).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
|
13
|
+
text.tr('-', ' ').gsub(/[^\w\s]/," ").split.each do |token|
|
14
|
+
token = token.stem
|
15
|
+
tokens << token if (token.length > 3 && !stop_words.include?(token))
|
16
|
+
end
|
17
|
+
tokens
|
18
|
+
end
|
19
|
+
|
20
|
+
# ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
21
|
+
def stop_words
|
22
|
+
%w{
|
23
|
+
a a's able about above according accordingly across actually after
|
24
|
+
afterwards again against ain't all allow allows almost alone along
|
25
|
+
already also although always am among amongst an and another any
|
26
|
+
anybody anyhow anyone anything anyway anyways anywhere apart appear
|
27
|
+
appreciate appropriate are aren't around as aside ask asking
|
28
|
+
associated at available away awfully b be became because become
|
29
|
+
becomes becoming been before beforehand behind being believe below
|
30
|
+
beside besides best better between beyond both brief but by c
|
31
|
+
c'mon c's came can can't cannot cant cause causes certain certainly
|
32
|
+
changes clearly co com come comes concerning consequently consider
|
33
|
+
considering contain containing contains corresponding could couldn't
|
34
|
+
course currently d definitely described despite did didn't different
|
35
|
+
do does doesn't doing don't done down downwards during e each edu
|
36
|
+
eg eight either else elsewhere enough entirely especially et etc
|
37
|
+
even ever every everybody everyone everything everywhere ex exactly
|
38
|
+
example except f far few fifth first five followed following follows
|
39
|
+
for former formerly forth four from further furthermore g get gets
|
40
|
+
getting given gives go goes going gone got gotten greetings h had
|
41
|
+
hadn't happens hardly has hasn't have haven't having he he's hello
|
42
|
+
help hence her here here's hereafter hereby herein hereupon hers
|
43
|
+
herself hi him himself his hither hopefully how howbeit however i
|
44
|
+
i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
|
45
|
+
indicate indicated indicates inner insofar instead into inward is
|
46
|
+
isn't it it'd it'll it's its itself j just k keep keeps kept know
|
47
|
+
knows known l last lately later latter latterly least less lest let
|
48
|
+
let's like liked likely little look looking looks ltd m mainly many
|
49
|
+
may maybe me mean meanwhile merely might more moreover most mostly
|
50
|
+
much must my myself n name namely nd near nearly necessary need needs
|
51
|
+
neither never nevertheless new next nine no nobody non none noone
|
52
|
+
nor normally not nothing novel now nowhere o obviously of off often
|
53
|
+
oh ok okay old on once one ones only onto or other others otherwise
|
54
|
+
ought our ours ourselves out outside over overall own p particular
|
55
|
+
particularly per perhaps placed please plus possible presumably
|
56
|
+
probably provides q que quite qv r rather rd re really reasonably
|
57
|
+
regarding regardless regards relatively respectively right s said
|
58
|
+
same saw say saying says second secondly see seeing seem seemed
|
59
|
+
seeming seems seen self selves sensible sent serious seriously
|
60
|
+
seven several shall she should shouldn't since six so some somebody
|
61
|
+
somehow someone something sometime sometimes somewhat somewhere soon
|
62
|
+
sorry specified specify specifying still sub such sup sure t t's
|
63
|
+
take taken tell tends th than thank thanks thanx that that's thats
|
64
|
+
the their theirs them themselves then thence there there's thereafter
|
65
|
+
thereby therefore therein theres thereupon these they they'd they'll
|
66
|
+
they're they've think third this thorough thoroughly those though
|
67
|
+
three through throughout thru thus to together too took toward
|
68
|
+
towards tried tries truly try trying twice two u un under
|
69
|
+
unfortunately unless unlikely until unto up upon us use used useful
|
70
|
+
uses using usually uucp v value various very via viz vs w want wants
|
71
|
+
was wasn't way we we'd we'll we're we've welcome well went were weren't
|
72
|
+
what what's whatever when whence whenever where where's whereafter
|
73
|
+
whereas whereby wherein whereupon wherever whether which while
|
74
|
+
whither who who's whoever whole whom whose why will willing wish
|
75
|
+
with within without won't wonder would would wouldn't x y yes yet
|
76
|
+
you you'd you'll you're you've your yours yourself yourselves
|
77
|
+
z zero
|
78
|
+
}
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
82
|
+
end
|
data/lib/vector_space/builder.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# Adapted from : https://github.com/josephwilk/rsemantic
|
2
|
+
|
3
|
+
module Hoatzin
|
4
|
+
module VectorSpace
|
5
|
+
#An algebraic model for representing text documents as vectors of identifiers.
|
6
|
+
#A document is represented as a vector. Each dimension of the vector corresponds to a
|
7
|
+
#separate term. If a term occurs in the document, then the value in the vector is non-zero.
|
8
|
+
class Builder
|
9
|
+
|
10
|
+
attr_accessor :vector_keyword_index
|
11
|
+
|
12
|
+
def initialize(options={})
|
13
|
+
@parser = options.delete(:parser)
|
14
|
+
@options = options
|
15
|
+
@parsed_document_cache = []
|
16
|
+
end
|
17
|
+
|
18
|
+
def build_document_matrix(documents)
|
19
|
+
@vector_keyword_index = build_vector_keyword_index(documents)
|
20
|
+
|
21
|
+
document_matrix = []
|
22
|
+
document_matrix += documents.enum_for(:each_with_index).map{|document,document_id| build_vector(document, document_id)}
|
23
|
+
|
24
|
+
Model.new(document_matrix, @vector_keyword_index)
|
25
|
+
end
|
26
|
+
|
27
|
+
def build_query_vector(text)
|
28
|
+
build_vector(text)
|
29
|
+
end
|
30
|
+
|
31
|
+
def marshal_dump
|
32
|
+
[@parser, @options, @parsed_document_cache, @vector_keyword_index]
|
33
|
+
end
|
34
|
+
|
35
|
+
def marshal_load(ary)
|
36
|
+
@parser, @options, @parsed_document_cache, @vector_keyword_index = ary
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
def build_vector_keyword_index(documents)
|
41
|
+
parse_and_cache(documents)
|
42
|
+
vocabulary_list = find_unique_vocabulary
|
43
|
+
map_vocabulary_to_vector_positions(vocabulary_list)
|
44
|
+
end
|
45
|
+
|
46
|
+
def parse_and_cache(documents)
|
47
|
+
documents.each_with_index do |document, index|
|
48
|
+
@parsed_document_cache[index] = @parser.tokenize(document)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def find_unique_vocabulary
|
53
|
+
vocabulary_list = @parsed_document_cache.inject([]) { |parsed_document, vocabulary_list| vocabulary_list + parsed_document }
|
54
|
+
vocabulary_list.uniq
|
55
|
+
end
|
56
|
+
|
57
|
+
def map_vocabulary_to_vector_positions(vocabulary_list)
|
58
|
+
vector_index={}
|
59
|
+
column = 0
|
60
|
+
vocabulary_list.each do |word|
|
61
|
+
vector_index[word] = column
|
62
|
+
column += 1
|
63
|
+
end
|
64
|
+
vector_index
|
65
|
+
end
|
66
|
+
|
67
|
+
def build_vector(word_string, document_id=nil)
|
68
|
+
if document_id.nil?
|
69
|
+
word_list = @parser.tokenize(word_string)
|
70
|
+
else
|
71
|
+
word_list = @parsed_document_cache[document_id]
|
72
|
+
end
|
73
|
+
|
74
|
+
vector = Array.new(@vector_keyword_index.length, 0)
|
75
|
+
word_list.each { |word| vector[@vector_keyword_index[word]] += 1 if @vector_keyword_index.has_key?(word) }
|
76
|
+
vector
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/lib/vector_space/model.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Adapted from : https://github.com/josephwilk/rsemantic
|
2
|
+
|
3
|
+
#require 'stringio'
|
4
|
+
|
5
|
+
module Hoatzin
|
6
|
+
module VectorSpace
|
7
|
+
|
8
|
+
class Model
|
9
|
+
|
10
|
+
def initialize(matrix, keywords)
|
11
|
+
@keywords = keywords || {}
|
12
|
+
@_dc_obj = matrix
|
13
|
+
end
|
14
|
+
|
15
|
+
def matrix=(matrix)
|
16
|
+
@_dc_obj = matrix
|
17
|
+
end
|
18
|
+
|
19
|
+
def matrix
|
20
|
+
@_dc_obj
|
21
|
+
end
|
22
|
+
|
23
|
+
# def to_s
|
24
|
+
# out = StringIO.new
|
25
|
+
# out.print " " * 9
|
26
|
+
#
|
27
|
+
# matrix.ncol.times do |id|
|
28
|
+
# out.print " D#{id+1} "
|
29
|
+
# end
|
30
|
+
# out.puts
|
31
|
+
#
|
32
|
+
# matrix.rows.each_with_index do |terms, index|
|
33
|
+
# out.print "#{@keywords.index(index).ljust(6)}" if @keywords.has_value?(index)
|
34
|
+
# out.print "[ "
|
35
|
+
# terms.columns.each do |document|
|
36
|
+
# out.print "%+0.2f " % document
|
37
|
+
# end
|
38
|
+
# out.print "]"
|
39
|
+
# out.puts
|
40
|
+
# end
|
41
|
+
# out.string
|
42
|
+
# end
|
43
|
+
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/test/models/readonly-test/metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
svm_type c_svc
|
2
2
|
kernel_type rbf
|
3
|
-
gamma 0.
|
3
|
+
gamma 0.0833333
|
4
4
|
nr_class 2
|
5
5
|
total_sv 7
|
6
|
-
rho -0.
|
6
|
+
rho -0.44634
|
7
7
|
label 0 1
|
8
8
|
nr_sv 4 3
|
9
9
|
SV
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
-
|
15
|
-
-0.
|
16
|
-
-
|
10
|
+
0.09624983775649808 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:1 9:0 10:0 11:1
|
11
|
+
1.421056914116459 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:0 9:1 10:1 11:0
|
12
|
+
3.139840844768948 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:0 10:0 11:0
|
13
|
+
2.541431764584626 0:0 1:0 2:0 3:0 4:0 5:1 6:1 7:0 8:0 9:0 10:0 11:0
|
14
|
+
-0.1202843540395405 0:1 1:0 2:0 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
15
|
+
-0.7832988164395933 0:1 1:1 2:1 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
16
|
+
-6.294996190747396 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
data/test/models/readonly-test/model
CHANGED
Binary file
|
data/test/models/test/metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
svm_type c_svc
|
2
2
|
kernel_type rbf
|
3
|
-
gamma 0.
|
3
|
+
gamma 0.0833333
|
4
4
|
nr_class 2
|
5
5
|
total_sv 7
|
6
|
-
rho -0.
|
6
|
+
rho -0.44634
|
7
7
|
label 0 1
|
8
8
|
nr_sv 4 3
|
9
9
|
SV
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
-
|
15
|
-
-0.
|
16
|
-
-
|
10
|
+
0.09624983775649808 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:1 9:0 10:0 11:1
|
11
|
+
1.421056914116459 0:0 1:0 2:0 3:0 4:0 5:1 6:0 7:0 8:0 9:1 10:1 11:0
|
12
|
+
3.139840844768948 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:1 8:1 9:0 10:0 11:0
|
13
|
+
2.541431764584626 0:0 1:0 2:0 3:0 4:0 5:1 6:1 7:0 8:0 9:0 10:0 11:0
|
14
|
+
-0.1202843540395405 0:1 1:0 2:0 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
15
|
+
-0.7832988164395933 0:1 1:1 2:1 3:1 4:1 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
16
|
+
-6.294996190747396 0:1 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0
|
data/test/models/test/model
CHANGED
Binary file
|
data/test/test_hoatzin.rb
CHANGED
@@ -9,7 +9,7 @@ class TestHoatzin < Test::Unit::TestCase
|
|
9
9
|
end
|
10
10
|
|
11
11
|
should "support training and classification" do
|
12
|
-
assert_equal @c.train(:positive, "Thats nice"), [[1, 1]]
|
12
|
+
assert_equal @c.train(:positive, "Thats nice"), [0] #[[1, 1]]
|
13
13
|
assert_equal @c.classify("Thats nice"), :positive
|
14
14
|
end
|
15
15
|
|
@@ -22,10 +22,10 @@ class TestHoatzin < Test::Unit::TestCase
|
|
22
22
|
end
|
23
23
|
|
24
24
|
should "classify the test set correctly" do
|
25
|
-
#@c.save(:metadata => METADATA_FILE, :model => MODEL_FILE, :update => true)
|
26
25
|
TESTING_LABELS.each_with_index do |label, index|
|
27
26
|
assert_equal @c.classify(TESTING_DOCS[index]), label
|
28
27
|
end
|
28
|
+
#@c.save(:metadata => READONLY_METADATA_FILE, :model => READONLY_MODEL_FILE, :update => false)
|
29
29
|
end
|
30
30
|
|
31
31
|
should "return the classifications" do
|
@@ -71,4 +71,5 @@ class TestHoatzin < Test::Unit::TestCase
|
|
71
71
|
end
|
72
72
|
|
73
73
|
end
|
74
|
+
|
74
75
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hoatzin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
- 1
|
8
|
+
- 2
|
8
9
|
- 0
|
9
|
-
version: 0.1.0
|
10
|
+
version: 0.2.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- robl
|
@@ -14,91 +15,97 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date:
|
18
|
+
date: 2011-01-03 00:00:00 +00:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
22
|
+
prerelease: false
|
21
23
|
name: libsvm-ruby-swig
|
22
|
-
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
23
25
|
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
27
30
|
segments:
|
28
31
|
- 0
|
29
32
|
version: "0"
|
33
|
+
requirement: *id001
|
30
34
|
type: :runtime
|
31
|
-
prerelease: false
|
32
|
-
version_requirements: *id001
|
33
35
|
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
34
37
|
name: fast-stemmer
|
35
|
-
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
36
39
|
none: false
|
37
40
|
requirements:
|
38
41
|
- - ">="
|
39
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
40
44
|
segments:
|
41
45
|
- 0
|
42
46
|
version: "0"
|
47
|
+
requirement: *id002
|
43
48
|
type: :runtime
|
44
|
-
prerelease: false
|
45
|
-
version_requirements: *id002
|
46
49
|
- !ruby/object:Gem::Dependency
|
50
|
+
prerelease: false
|
47
51
|
name: shoulda
|
48
|
-
|
52
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
49
53
|
none: false
|
50
54
|
requirements:
|
51
55
|
- - ">="
|
52
56
|
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
53
58
|
segments:
|
54
59
|
- 0
|
55
60
|
version: "0"
|
61
|
+
requirement: *id003
|
56
62
|
type: :development
|
57
|
-
prerelease: false
|
58
|
-
version_requirements: *id003
|
59
63
|
- !ruby/object:Gem::Dependency
|
64
|
+
prerelease: false
|
60
65
|
name: bundler
|
61
|
-
|
66
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
62
67
|
none: false
|
63
68
|
requirements:
|
64
69
|
- - ~>
|
65
70
|
- !ruby/object:Gem::Version
|
71
|
+
hash: 23
|
66
72
|
segments:
|
67
73
|
- 1
|
68
74
|
- 0
|
69
75
|
- 0
|
70
76
|
version: 1.0.0
|
77
|
+
requirement: *id004
|
71
78
|
type: :development
|
72
|
-
prerelease: false
|
73
|
-
version_requirements: *id004
|
74
79
|
- !ruby/object:Gem::Dependency
|
80
|
+
prerelease: false
|
75
81
|
name: jeweler
|
76
|
-
|
82
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
77
83
|
none: false
|
78
84
|
requirements:
|
79
85
|
- - ~>
|
80
86
|
- !ruby/object:Gem::Version
|
87
|
+
hash: 7
|
81
88
|
segments:
|
82
89
|
- 1
|
83
90
|
- 5
|
84
91
|
- 2
|
85
92
|
version: 1.5.2
|
93
|
+
requirement: *id005
|
86
94
|
type: :development
|
87
|
-
prerelease: false
|
88
|
-
version_requirements: *id005
|
89
95
|
- !ruby/object:Gem::Dependency
|
96
|
+
prerelease: false
|
90
97
|
name: rcov
|
91
|
-
|
98
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
92
99
|
none: false
|
93
100
|
requirements:
|
94
101
|
- - ">="
|
95
102
|
- !ruby/object:Gem::Version
|
103
|
+
hash: 3
|
96
104
|
segments:
|
97
105
|
- 0
|
98
106
|
version: "0"
|
107
|
+
requirement: *id006
|
99
108
|
type: :development
|
100
|
-
prerelease: false
|
101
|
-
version_requirements: *id006
|
102
109
|
description: Hoatzin is a text classifier in Ruby that uses SVM for it's classification.
|
103
110
|
email: robl@rjlee.net
|
104
111
|
executables: []
|
@@ -116,7 +123,11 @@ files:
|
|
116
123
|
- Rakefile
|
117
124
|
- VERSION
|
118
125
|
- hoatzin.gemspec
|
126
|
+
- lib/classifier.rb
|
119
127
|
- lib/hoatzin.rb
|
128
|
+
- lib/parser.rb
|
129
|
+
- lib/vector_space/builder.rb
|
130
|
+
- lib/vector_space/model.rb
|
120
131
|
- test/helper.rb
|
121
132
|
- test/models/readonly-test/metadata
|
122
133
|
- test/models/readonly-test/model
|
@@ -137,7 +148,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
137
148
|
requirements:
|
138
149
|
- - ">="
|
139
150
|
- !ruby/object:Gem::Version
|
140
|
-
hash:
|
151
|
+
hash: 3
|
141
152
|
segments:
|
142
153
|
- 0
|
143
154
|
version: "0"
|
@@ -146,6 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
157
|
requirements:
|
147
158
|
- - ">="
|
148
159
|
- !ruby/object:Gem::Version
|
160
|
+
hash: 3
|
149
161
|
segments:
|
150
162
|
- 0
|
151
163
|
version: "0"
|