svm_toolkit 1.1.7-java → 1.1.8-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.rdoc +59 -59
- data/README.rdoc +99 -103
- data/bin/svm-demo +354 -354
- data/lib/libsvm.jar +0 -0
- data/lib/svm_toolkit/evaluators.rb +169 -169
- data/lib/svm_toolkit/model.rb +122 -124
- data/lib/svm_toolkit/node.rb +17 -21
- data/lib/svm_toolkit/parameter.rb +114 -117
- data/lib/svm_toolkit/problem.rb +294 -308
- data/lib/svm_toolkit/svm.rb +219 -224
- data/lib/svm_toolkit.rb +37 -37
- metadata +26 -12
data/lib/svm_toolkit/problem.rb
CHANGED
|
@@ -1,308 +1,294 @@
|
|
|
1
|
-
module SvmToolkit
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# *
|
|
14
|
-
# *
|
|
15
|
-
#
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
problem =
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
#
|
|
56
|
-
#
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
max_index
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
# --
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
instances
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
instance
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
current_min = x[index][col].value
|
|
296
|
-
end
|
|
297
|
-
if x[index][col].value > current_max
|
|
298
|
-
current_max = x[index][col].value
|
|
299
|
-
end
|
|
300
|
-
end
|
|
301
|
-
# -- then update each value
|
|
302
|
-
self.l.times do |index|
|
|
303
|
-
x[index][col].value = ((max_value - min_value) * (x[index][col].value - current_min) / (current_max - current_min)) + min_value
|
|
304
|
-
end
|
|
305
|
-
end
|
|
306
|
-
end
|
|
307
|
-
end
|
|
308
|
-
|
|
1
|
+
module SvmToolkit

  # Holds a set of labelled data.
  #
  # NOTE(review): this file only adds methods to Problem. It relies on the
  # +l+ (instance count), +x+ (rows of Node objects) and +y+ (labels)
  # accessors, on Node, and on JRuby's <tt>Type[dims].new</tt> array syntax,
  # all of which are presumably provided by the bundled libsvm Java classes --
  # confirm against svm_toolkit.rb.
  class Problem

    # To select SvmLight input file format
    SvmLight = 0

    # To select Csv input file format
    Csv = 1

    # To select ARFF input file format
    Arff = 2

    # Support constructing a problem from arrays of numbers (floating-point values).
    #
    # * instances - an array of instances, each instance being an array of numbers.
    # * labels - an array of numbers, forming the labels for each instance.
    #
    # An ArgumentError exception is raised unless all the following conditions are met:
    # * the number of instances should equal the number of labels,
    # * there must be at least one instance, and
    # * every instance must have the same number of features.
    #
    # Returns the new Problem; feature j of every instance is stored in a
    # Node whose index is j.
    def self.from_array(instances, labels)
      unless instances.size == labels.size
        raise ArgumentError, "Number of instances must equal number of labels"
      end
      if instances.empty?
        raise ArgumentError, "There must be at least one instance."
      end
      smallest, largest = instances.map(&:size).minmax
      unless smallest == largest
        raise ArgumentError, "All instances must have the same size"
      end

      rows = instances.map do |instance|
        instance.each_with_index.map { |value, j| Node.new(j, value) }
      end
      build_from_nodes(rows, labels, largest)
    end

    #
    # Read in a problem definition from a given filename,
    # using format SvmLight (default), Csv or Arff.
    #
    # Raises ArgumentError if format is not one of those three constants
    # (previously an unknown format silently returned nil).
    def self.from_file(filename, format = SvmLight)
      case format
      when SvmLight
        Problem.from_file_svmlight(filename)
      when Csv
        Problem.from_file_csv(filename)
      when Arff
        Problem.from_file_arff(filename)
      else
        raise ArgumentError, "Unknown file format: #{format.inspect}"
      end
    end

    # Read in a problem definition in svmlight format.
    #
    # Each line is "label index:value index:value ...". Blank lines are
    # skipped and anything from a '#' to the end of a line is treated as a
    # comment. Every row of the result has max_index + 1 nodes (position 0
    # is reserved so that a feature's index is also its position); features
    # absent from a line are stored as zero-valued nodes.
    def self.from_file_svmlight filename
      rows = []
      labels = []
      max_index = 0
      # File.foreach, unlike IO.foreach, never treats a leading "|" in the
      # filename as a command to run.
      File.foreach(filename) do |line|
        tokens = line.sub(/#.*/, "").split(" ")
        next if tokens.empty? # blank or comment-only line

        labels << tokens.first.to_f
        nodes = tokens.drop(1).map do |feature|
          index, value = feature.split(":")
          max_index = [index.to_i, max_index].max
          Node.new(index.to_i, value.to_f)
        end
        rows << nodes
      end

      build_from_nodes(rows, labels, max_index + 1) # + 1 to allow for 0 position
    end

    # Read in a problem definition in csv format from given filename.
    #
    # The first field of each row is the label; the remaining fields are the
    # features, numbered from 0. Blank rows are skipped. Every row of the
    # result has as many nodes as the longest row has features; shorter rows
    # are padded with zero-valued nodes.
    def self.from_file_csv filename
      rows = []
      labels = []
      width = 0
      csv_data = CSV.parse(File.read(filename), headers: false, skip_blanks: true)
      csv_data.each do |tokens|
        labels << tokens.first.to_f
        rows << tokens.drop(1).each_with_index.map { |value, index| Node.new(index, value.to_f) }
        width = [tokens.size - 1, width].max # every field but the label
      end

      build_from_nodes(rows, labels, width)
    end

    # Read in a problem definition in arff format, from given filename.
    # Assumes all values are numbers (non-numbers converted to 0.0),
    # and that the class is the last field.
    #
    # Everything up to and including the "@data" line is ignored. In the data
    # section, blank lines and lines starting with '%' are skipped; all fields
    # before the last one are features, numbered from 0.
    def self.from_file_arff filename
      rows = []
      labels = []
      width = 0
      found_data = false
      File.foreach(filename) do |line|
        unless found_data
          # NOTE(review): header lines are echoed to stdout; kept as-is for
          # backward compatibility, consider removing.
          puts "Ignoring", line
          found_data = line.downcase.strip == "@data"
          next
        end

        stripped = line.strip
        next if stripped.empty? || stripped.start_with?("%")

        tokens = stripped.split(",")
        labels << tokens.last.to_f
        rows << tokens[0...-1].each_with_index.map { |value, index| Node.new(index, value.to_f) }
        width = [tokens.size - 1, width].max # every field but the class
      end

      build_from_nodes(rows, labels, width)
    end

    # Assemble a Problem from rows of nodes.
    #
    # * rows - one array of Node objects per instance; each node is placed
    #   at position node.index of its row.
    # * labels - one label per row.
    # * width - number of nodes in every row of the result.
    #
    # Positions not covered by a row's nodes receive a zero-valued node whose
    # index equals its position, so indices in a row are always 0...width in
    # ascending order.
    def self.build_from_nodes(rows, labels, width)
      problem = Problem.new
      problem.l = rows.size
      # -- add in the training data
      problem.x = Node[rows.size, width].new
      rows.each_with_index do |nodes, i|
        width.times { |j| problem.x[i][j] = Node.new(j, 0) }
        nodes.each { |node| problem.x[i][node.index] = node }
      end
      # -- add in the labels
      problem.y = Java::double[labels.size].new
      labels.each_with_index { |label, i| problem.y[i] = label }

      problem
    end
    private_class_method :build_from_nodes

    # Returns the number of instances
    def size
      self.l
    end

    # Return label of nth instance
    def label(n)
      self.y[n]
    end

    # Return array of values for nth instance
    def values(n)
      self.x[n].map { |node| node.value }
    end

    # Rescale values within problem to be in range min_value to max_value
    #
    # For SVM models, it is recommended all features be in range [0,1] or [-1,1]
    #
    # The nodes are modified in place. Does nothing for a problem with no
    # instances. A column whose values are all equal is set to min_value.
    def rescale(min_value = 0.0, max_value = 1.0)
      return if self.l.zero?

      x[0].size.times { |col| rescale_column(col, min_value, max_value) }
    end

    # Create a new problem by combining the instances in this problem with
    # those in the given problem.
    #
    # The instances of this problem come first. Raises ArgumentError if the
    # two problems have different numbers of features. The new problem refers
    # to the same Node objects as the originals (they are not copied).
    def merge problem
      num_features = self.x[0].size
      unless num_features == problem.x[0].size
        raise ArgumentError, "Cannot merge two problems with different numbers of features"
      end
      num_instances = size + problem.size

      new_problem = Problem.new
      new_problem.l = num_instances
      new_problem.x = Node[num_instances, num_features].new
      new_problem.y = Java::double[num_instances].new
      num_instances.times do |i|
        # rows below +size+ come from this problem, the rest from the other
        source, row = i < size ? [self, i] : [problem, i - size]
        num_features.times { |j| new_problem.x[i][j] = source.x[row][j] }
        new_problem.y[i] = source.y[row]
      end

      new_problem
    end

    private

    # Rescale values within problem for given column index,
    # to be in range min_value to max_value
    def rescale_column(col, min_value, max_value)
      column = Array.new(self.l) { |row| x[row][col] }
      # -- first locate the column's range
      current_min = column.first.value
      current_max = current_min
      column.each do |node|
        current_min = node.value if node.value < current_min
        current_max = node.value if node.value > current_max
      end
      spread = current_max - current_min
      # -- then update each value
      column.each do |node|
        node.value =
          if spread.zero?
            # constant column: avoid 0/0 (NaN) and pin to the lower bound
            min_value
          else
            ((max_value - min_value) * (node.value - current_min) / spread) + min_value
          end
      end
    end
  end
end
|
|
294
|
+
|