svm_toolkit 1.1.7-java → 1.1.8-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,308 +1,294 @@
1
- module SvmToolkit
2
-
3
- # Extends the Java Problem class with some additional features.
4
- #
5
- class Problem
6
-
7
- # Support constructing a problem from arrays of double values.
8
- #
9
- # instances:: an array of instances, each instance being an array of doubles.
10
- # labels:: an array of doubles, forming the labels for each instance.
11
- #
12
- # An ArgumentError exception is raised if all the following conditions are not met:
13
- # * the number of instances should equal the number of labels,
14
- # * there must be at least one instance, and
15
- # * every instance must have the same number of features.
16
- #
17
- def Problem.from_array(instances, labels)
18
- unless instances.size == labels.size
19
- raise ArgumentError.new "Number of instances must equal number of labels"
20
- end
21
- unless instances.size > 0
22
- raise ArgumentError.new "There must be at least one instance."
23
- end
24
- unless instances.collect {|i| i.size}.min == instances.collect {|i| i.size}.max
25
- raise ArgumentError.new "All instances must have the same size"
26
- end
27
-
28
- problem = Problem.new
29
- problem.l = labels.size
30
- # -- add in the training data
31
- problem.x = Node[instances.size, instances[0].size].new
32
- instances.each_with_index do |instance, i|
33
- instance.each_with_index do |v, j|
34
- problem.x[i][j] = Node.new(j, v)
35
- end
36
- end
37
- # -- add in the labels
38
- problem.y = Java::double[labels.size].new
39
- labels.each_with_index do |v, i|
40
- problem.y[i] = v
41
- end
42
-
43
- return problem
44
- end
45
-
46
- # To select SvmLight input file format
47
- SvmLight = 0
48
-
49
- # To select Csv input file format
50
- Csv = 1
51
-
52
- # To select ARFF input file format
53
- Arff = 2
54
-
55
- #
56
- # Read in a problem definition from a file.
57
- #
58
- # filename:: the name of the file
59
- # format:: either Svm::SvmLight (default), Svm::Csv or Svm::Arff
60
- #
61
- # Raises ArgumentError if there is any error in format.
62
- #
63
- def Problem.from_file(filename, format = SvmLight)
64
- case format
65
- when SvmLight
66
- return Problem.from_file_svmlight filename
67
- when Csv
68
- return Problem.from_file_csv filename
69
- when Arff
70
- return Problem.from_file_arff filename
71
- end
72
- end
73
-
74
- #
75
- # Read in a problem definition in svmlight format.
76
- #
77
- # filename:: the name of the file
78
- #
79
- # Raises ArgumentError if there is any error in format.
80
- #
81
- def Problem.from_file_svmlight filename
82
- instances = []
83
- labels = []
84
- max_index = 0
85
- IO.foreach(filename) do |line|
86
- tokens = line.split(" ")
87
- labels << tokens[0].to_f
88
- instance = []
89
- tokens[1..-1].each do |feature|
90
- index, value = feature.split(":")
91
- instance << Node.new(index.to_i, value.to_f)
92
- max_index = [index.to_i, max_index].max
93
- end
94
- instances << instance
95
- end
96
- max_index += 1 # to allow for 0 position
97
- unless instances.size == labels.size
98
- raise ArgumentError.new "Number of labels read differs from number of instances"
99
- end
100
- # now create a Problem definition
101
- problem = Problem.new
102
- problem.l = instances.size
103
- # -- add in the training data
104
- problem.x = Node[instances.size, max_index].new
105
- # -- fill with blank nodes
106
- instances.size.times do |i|
107
- max_index.times do |j|
108
- problem.x[i][j] = Node.new(i, 0)
109
- end
110
- end
111
- # -- add known values
112
- instances.each_with_index do |instance, i|
113
- instance.each do |node|
114
- problem.x[i][node.index] = node
115
- end
116
- end
117
- # -- add in the labels
118
- problem.y = Java::double[labels.size].new
119
- labels.each_with_index do |v, i|
120
- problem.y[i] = v
121
- end
122
-
123
- return problem
124
- end
125
-
126
- #
127
- # Read in a problem definition in csv format.
128
- #
129
- # filename:: the name of the file
130
- #
131
- # Raises ArgumentError if there is any error in format.
132
- #
133
- def Problem.from_file_csv filename
134
- instances = []
135
- labels = []
136
- max_index = 0
137
- csv_data = CSV.parse(File.read(filename), headers: false)
138
- csv_data.each do |tokens|
139
- labels << tokens[0].to_f
140
- instance = []
141
- tokens[1..-1].each_with_index do |value, index|
142
- instance << Node.new(index, value.to_f)
143
- end
144
- max_index = [tokens.size, max_index].max
145
- instances << instance
146
- end
147
- max_index += 1 # to allow for 0 position
148
- unless instances.size == labels.size
149
- raise ArgumentError.new "Number of labels read differs from number of instances"
150
- end
151
- # now create a Problem definition
152
- problem = Problem.new
153
- problem.l = instances.size
154
- # -- add in the training data
155
- problem.x = Node[instances.size, max_index].new
156
- # -- fill with blank nodes
157
- instances.size.times do |i|
158
- max_index.times do |j|
159
- problem.x[i][j] = Node.new(i, 0)
160
- end
161
- end
162
- # -- add known values
163
- instances.each_with_index do |instance, i|
164
- instance.each do |node|
165
- problem.x[i][node.index] = node
166
- end
167
- end
168
- # -- add in the labels
169
- problem.y = Java::double[labels.size].new
170
- labels.each_with_index do |v, i|
171
- problem.y[i] = v
172
- end
173
-
174
- return problem
175
- end
176
-
177
- #
178
- # Read in a problem definition in arff format.
179
- # Assumes all values are numbers (non-numbers converted to 0.0),
180
- # and that the class is the last field.
181
- #
182
- # filename:: the name of the file
183
- #
184
- # Raises ArgumentError if there is any error in format.
185
- #
186
- def Problem.from_file_arff filename
187
- instances = []
188
- labels = []
189
- max_index = 0
190
- found_data = false
191
- IO.foreach(filename) do |line|
192
- unless found_data
193
- puts "Ignoring", line
194
- found_data = line.downcase.strip == "@data"
195
- next # repeat the loop
196
- end
197
- tokens = line.split(",")
198
- labels << tokens.last.to_f
199
- instance = []
200
- tokens[1...-1].each_with_index do |value, index|
201
- instance << Node.new(index, value.to_f)
202
- end
203
- max_index = [tokens.size, max_index].max
204
- instances << instance
205
- end
206
- max_index += 1 # to allow for 0 position
207
- unless instances.size == labels.size
208
- raise ArgumentError.new "Number of labels read differs from number of instances"
209
- end
210
- # now create a Problem definition
211
- problem = Problem.new
212
- problem.l = instances.size
213
- # -- add in the training data
214
- problem.x = Node[instances.size, max_index].new
215
- # -- fill with blank nodes
216
- instances.size.times do |i|
217
- max_index.times do |j|
218
- problem.x[i][j] = Node.new(i, 0)
219
- end
220
- end
221
- # -- add known values
222
- instances.each_with_index do |instance, i|
223
- instance.each do |node|
224
- problem.x[i][node.index] = node
225
- end
226
- end
227
- # -- add in the labels
228
- problem.y = Java::double[labels.size].new
229
- labels.each_with_index do |v, i|
230
- problem.y[i] = v
231
- end
232
-
233
- return problem
234
- end
235
-
236
- # Returns the number of instances
237
- def size
238
- self.l
239
- end
240
-
241
- # Rescale values within problem to be in range min_value to max_value
242
- #
243
- # For SVM models, it is recommended all features be in range [0,1] or [-1,1]
244
- def rescale(min_value = 0.0, max_value = 1.0)
245
- return if self.l.zero?
246
- x[0].size.times do |i|
247
- rescale_column(i, min_value, max_value)
248
- end
249
- end
250
-
251
- # Create a new problem by combining the instances in this problem with
252
- # those in the given problem.
253
- def merge problem
254
- unless self.x[0].size == problem.x[0].size
255
- raise ArgumentError.new "Cannot merge two problems with different numbers of features"
256
- end
257
- num_features = self.x[0].size
258
- num_instances = size + problem.size
259
-
260
- new_problem = Problem.new
261
- new_problem.l = num_instances
262
- new_problem.x = Node[num_instances, num_features].new
263
- new_problem.y = Java::double[num_instances].new
264
- # fill out the features
265
- num_instances.times do |i|
266
- num_features.times do |j|
267
- if i < size
268
- new_problem.x[i][j] = self.x[i][j]
269
- else
270
- new_problem.x[i][j] = problem.x[i-size][j]
271
- end
272
- end
273
- end
274
- # fill out the labels
275
- num_instances.times do |i|
276
- if i < size
277
- new_problem.y[i] = self.y[i]
278
- else
279
- new_problem.y[i] = problem.y[i-size]
280
- end
281
- end
282
-
283
- return new_problem
284
- end
285
-
286
- # Rescale values within problem for given column index,
287
- # to be in range min_value to max_value
288
- private
289
- def rescale_column(col, min_value, max_value)
290
- # -- first locate the column's range
291
- current_min = x[0][col].value
292
- current_max = x[0][col].value
293
- self.l.times do |index|
294
- if x[index][col].value < current_min
295
- current_min = x[index][col].value
296
- end
297
- if x[index][col].value > current_max
298
- current_max = x[index][col].value
299
- end
300
- end
301
- # -- then update each value
302
- self.l.times do |index|
303
- x[index][col].value = ((max_value - min_value) * (x[index][col].value - current_min) / (current_max - current_min)) + min_value
304
- end
305
- end
306
- end
307
- end
308
-
1
module SvmToolkit

  # Holds a set of labelled data: a collection of instances (rows of feature
  # nodes) together with one numeric label per instance.
  #
  # The methods below rely on the accessors +l+ (number of instances), +x+
  # (rows of Node objects) and +y+ (labels), and on the Node class and the
  # JRuby +Java::double+ array type. None of these are defined in this file.
  class Problem

    # To select SvmLight input file format
    SvmLight = 0

    # To select Csv input file format
    Csv = 1

    # To select ARFF input file format
    Arff = 2

    # Support constructing a problem from arrays of numbers (floating-point values).
    #
    # * instances - an array of instances, each instance being an array of numbers.
    # * labels - an array of numbers, forming the labels for each instance.
    #
    # Returns the new Problem.
    #
    # An ArgumentError exception is raised unless all of the following
    # conditions are met:
    # * the number of instances equals the number of labels,
    # * there is at least one instance, and
    # * every instance has the same number of features.
    #
    def self.from_array(instances, labels)
      unless instances.size == labels.size
        raise ArgumentError, "Number of instances must equal number of labels"
      end
      if instances.empty?
        raise ArgumentError, "There must be at least one instance."
      end
      widths = instances.map(&:size)
      unless widths.min == widths.max
        raise ArgumentError, "All instances must have the same size"
      end

      problem = Problem.new
      problem.l = labels.size
      # -- add in the training data, one node per feature, indexed by column
      problem.x = Node[instances.size, instances.first.size].new
      instances.each_with_index do |instance, i|
        instance.each_with_index do |value, j|
          problem.x[i][j] = Node.new(j, value)
        end
      end
      # -- add in the labels
      problem.y = Java::double[labels.size].new
      labels.each_with_index { |label, i| problem.y[i] = label }

      problem
    end

    #
    # Read in a problem definition from a given filename,
    # using format SvmLight (default), Csv or Arff.
    #
    # Raises ArgumentError if format is not one of those three constants.
    #
    def self.from_file(filename, format = SvmLight)
      case format
      when SvmLight
        Problem.from_file_svmlight filename
      when Csv
        Problem.from_file_csv filename
      when Arff
        Problem.from_file_arff filename
      else
        # previously fell through and returned nil, deferring the failure to the caller
        raise ArgumentError, "Unknown file format: #{format.inspect}"
      end
    end

    # Read in a problem definition in svmlight format from given filename.
    #
    # Each line is "label index:value index:value ...". Anything after a '#'
    # is ignored, and lines holding no tokens are skipped. Features not
    # mentioned on a line are stored as zero.
    def self.from_file_svmlight(filename)
      rows = []
      labels = []
      max_index = 0
      File.foreach(filename) do |line|
        tokens = line.split("#", 2).first.to_s.split(" ")
        next if tokens.empty?

        labels << tokens.first.to_f
        nodes = tokens.drop(1).map do |feature|
          index, value = feature.split(":")
          Node.new(index.to_i, value.to_f)
        end
        max_index = [max_index, *nodes.map(&:index)].max
        rows << nodes
      end

      # one extra column so that position 0 exists alongside the highest index read
      from_nodes(rows, labels, max_index + 1)
    end

    # Read in a problem definition in csv format from given filename.
    #
    # The first field of every row is the label; the remaining fields are the
    # feature values, numbered from 0. Empty rows are skipped, and rows shorter
    # than the widest row are padded with zero-valued features.
    def self.from_file_csv(filename)
      rows = []
      labels = []
      width = 0
      CSV.parse(File.read(filename), headers: false).each do |tokens|
        next if tokens.empty?

        labels << tokens.first.to_f
        features = tokens.drop(1)
        rows << features.each_with_index.map { |value, index| Node.new(index, value.to_f) }
        # width is the number of feature columns: the label is not a feature
        width = [width, features.size].max
      end

      from_nodes(rows, labels, width)
    end

    # Read in a problem definition in arff format, from given filename.
    # Assumes all values are numbers (non-numbers converted to 0.0),
    # and that the class is the last field.
    #
    # Everything up to and including the "@data" line is skipped without
    # output; blank lines and '%' comment lines in the data section are ignored.
    def self.from_file_arff(filename)
      rows = []
      labels = []
      width = 0
      in_data = false
      File.foreach(filename) do |line|
        text = line.strip
        unless in_data
          in_data = text.downcase == "@data"
          next
        end
        next if text.empty? || text.start_with?("%")

        tokens = text.split(",")
        labels << tokens.last.to_f
        # every field except the last (the class) is a feature, including the first
        features = tokens[0...-1]
        rows << features.each_with_index.map { |value, index| Node.new(index, value.to_f) }
        width = [width, features.size].max
      end

      from_nodes(rows, labels, width)
    end

    # Builds a Problem from parsed rows.
    #
    # * rows - an array with one array of Node objects per instance; each
    #   node's index is the column it belongs to.
    # * labels - the label for each row.
    # * width - the number of columns every stored instance will have.
    #
    # Raises ArgumentError if the number of rows and labels differ.
    def self.from_nodes(rows, labels, width)
      unless rows.size == labels.size
        raise ArgumentError, "Number of labels read differs from number of instances"
      end

      problem = Problem.new
      problem.l = rows.size
      problem.x = Node[rows.size, width].new
      rows.each_with_index do |nodes, i|
        # -- fill with blank nodes, each carrying its own column index
        width.times { |j| problem.x[i][j] = Node.new(j, 0) }
        # -- add known values
        nodes.each { |node| problem.x[i][node.index] = node }
      end
      # -- add in the labels
      problem.y = Java::double[labels.size].new
      labels.each_with_index { |label, i| problem.y[i] = label }

      problem
    end
    private_class_method :from_nodes

    # Returns the number of instances
    def size
      self.l
    end

    # Return label of nth instance
    def label(n)
      y[n]
    end

    # Return array of values for nth instance
    def values(n)
      x[n].map { |node| node.value }
    end

    # Rescale values within problem to be in range min_value to max_value
    #
    # For SVM models, it is recommended all features be in range [0,1] or [-1,1]
    #
    # The node values are changed in place. Does nothing for an empty problem.
    def rescale(min_value = 0.0, max_value = 1.0)
      return if size.zero?

      x[0].size.times do |col|
        rescale_column(col, min_value, max_value)
      end
    end

    # Create a new problem by combining the instances in this problem with
    # those in the given problem. This problem's instances come first.
    #
    # The new problem refers to the same Node objects as the two sources
    # (they are not copied), so rescaling one also alters the others.
    #
    # Raises ArgumentError if the first instances of the two problems have
    # different numbers of features.
    def merge(problem)
      unless x[0].size == problem.x[0].size
        raise ArgumentError, "Cannot merge two problems with different numbers of features"
      end
      num_features = x[0].size
      offset = size
      num_instances = offset + problem.size

      new_problem = Problem.new
      new_problem.l = num_instances
      new_problem.x = Node[num_instances, num_features].new
      new_problem.y = Java::double[num_instances].new
      # -- this problem's instances and labels first
      offset.times do |i|
        num_features.times { |j| new_problem.x[i][j] = x[i][j] }
        new_problem.y[i] = y[i]
      end
      # -- followed by those of the given problem
      problem.size.times do |i|
        num_features.times { |j| new_problem.x[offset + i][j] = problem.x[i][j] }
        new_problem.y[offset + i] = problem.y[i]
      end

      new_problem
    end

    private

    # Rescale values within problem for given column index,
    # to be in range min_value to max_value
    #
    # A column in which every value is the same has no range to stretch, so
    # all of its values are set to min_value instead of dividing by zero.
    def rescale_column(col, min_value, max_value)
      # -- first locate the column's range
      current_min, current_max = Array.new(size) { |row| x[row][col].value }.minmax
      range = current_max - current_min
      # -- then update each value
      size.times do |row|
        node = x[row][col]
        node.value =
          if range.zero?
            min_value
          else
            ((max_value - min_value) * (node.value - current_min) / range) + min_value
          end
      end
    end
  end
end
294
+