svmredlight 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +1 -0
- data/README.rdoc +6 -0
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/ext/svmredlight.c +142 -126
- data/lib/svmredlight.rb +2 -2
- data/lib/svmredlight/document.rb +2 -4
- data/lib/svmredlight/model.rb +44 -1
- data/svmredlight.gemspec +2 -2
- data/test/helper.rb +1 -0
- data/test/test_document.rb +47 -41
- data/test/test_model.rb +120 -75
- metadata +3 -3
data/.document
CHANGED
data/README.rdoc
CHANGED
@@ -7,9 +7,15 @@ A partial interface to SVM-light [http://svmlight.joachims.org/] using it you ca
|
|
7
7
|
|
8
8
|
As of now it's known to work with SVM 6.02.
|
9
9
|
|
10
|
+
=== Installing svmlight as a library
|
11
|
+
|
10
12
|
Make sure to build the libsvmlight.o version of svmlight by using
|
11
13
|
"make libsvmlight_hideo".
|
12
14
|
|
15
|
+
Make sure the .h files in the svmlight distribution are in your include path, inside a subdirectory called svm_light,
|
16
|
+
and the object code for the library is in your library path (/usr/lib for instance).
|
17
|
+
|
18
|
+
|
13
19
|
|
14
20
|
== Document
|
15
21
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ require 'rake'
|
|
14
14
|
require 'jeweler'
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.version = '0.1.
|
17
|
+
gem.version = '0.1.1'
|
18
18
|
gem.name = "svmredlight"
|
19
19
|
gem.homepage = "http://github.com/camilo/svmredlight"
|
20
20
|
gem.license = "MIT"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.1
|
data/ext/svmredlight.c
CHANGED
@@ -9,13 +9,10 @@ is_linear(MODEL *model){
|
|
9
9
|
return model->kernel_parm.kernel_type == 0;
|
10
10
|
}
|
11
11
|
|
12
|
-
// Modules and Classes
|
13
12
|
static VALUE rb_mSvmLight;
|
14
13
|
static VALUE rb_cModel;
|
15
14
|
static VALUE rb_cDocument;
|
16
15
|
|
17
|
-
// GC functions
|
18
|
-
|
19
16
|
/* Not using deep free anymore, let ruby call free on the documents otherwise we might end
|
20
17
|
* up having double free problems, from svm_learn_main: Warning: The model contains
|
21
18
|
* references to the original data 'docs'. If you want to free the original data, and
|
@@ -52,11 +49,12 @@ model_read_from_file(VALUE klass, VALUE filename){
|
|
52
49
|
/* Helper function type checks a string meant to be used as a learn_parm, in case of error
|
53
50
|
* returns 1 and sets the correct exception message in error, on success returns 0 and
|
54
51
|
* copies the c string data of new_val to target*/
|
55
|
-
int
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
52
|
+
int
|
53
|
+
check_string_param(VALUE new_val,
|
54
|
+
const char *default_val,
|
55
|
+
char *target,
|
56
|
+
const char *name,
|
57
|
+
char *error){
|
60
58
|
|
61
59
|
if(TYPE(new_val) == T_STRING){
|
62
60
|
strlcpy(target, StringValuePtr(new_val), 199);
|
@@ -73,7 +71,8 @@ int check_string_param(VALUE new_val,
|
|
73
71
|
/* Helper function type checks a long meant to be used as a learn_parm or kernel_parm, in
|
74
72
|
* case of error returns 1 and sets the correct exception message in error, on success
|
75
73
|
* returns 0 and copies the c string data of new_val to target*/
|
76
|
-
int
|
74
|
+
int
|
75
|
+
check_long_param(VALUE new_val,
|
77
76
|
long default_val,
|
78
77
|
long *target,
|
79
78
|
const char *name,
|
@@ -93,11 +92,12 @@ int check_long_param(VALUE new_val,
|
|
93
92
|
/* Helper function type checks a double meant to be used as a learn_parm or kernel_parm, in
|
94
93
|
* case of error returns 1 and sets the correct exception message in error, on success
|
95
94
|
* returns 0 and copies the c string data of new_val to target*/
|
96
|
-
int
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
95
|
+
int
|
96
|
+
check_double_param(VALUE new_val,
|
97
|
+
double default_val,
|
98
|
+
double *target,
|
99
|
+
const char *name,
|
100
|
+
char *error){
|
101
101
|
if(TYPE(new_val) == T_FLOAT || TYPE(new_val) == T_FIXNUM){
|
102
102
|
*target = NUM2DBL(new_val);
|
103
103
|
}else if(NIL_P(new_val) ){
|
@@ -113,11 +113,12 @@ int check_double_param(VALUE new_val,
|
|
113
113
|
/* Helper function type checks an int meant to be used as a boolean learn_parm or
|
114
114
|
* kernel_parm, in case of error returns 1 and sets the correct exception message in
|
115
115
|
* error, on success returns 0 and copies the c string data of new_val to target*/
|
116
|
-
int
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
116
|
+
int
|
117
|
+
check_bool_param(VALUE new_val,
|
118
|
+
long default_val,
|
119
|
+
long *target,
|
120
|
+
const char *name,
|
121
|
+
char *error){
|
121
122
|
if(TYPE(new_val) == T_TRUE){
|
122
123
|
*target = 1L;
|
123
124
|
}else if(TYPE(new_val) == T_FALSE){
|
@@ -134,208 +135,209 @@ int check_bool_param(VALUE new_val,
|
|
134
135
|
|
135
136
|
/* Helper function in charge of setting up the learn parameters before they are passed to
|
136
137
|
* the svm_learn_classification copies part of the logic in svm_learn_main.c */
|
137
|
-
int
|
138
|
+
int
|
139
|
+
setup_learn_params(LEARN_PARM *c_learn_param, VALUE r_hash, char *error_message){
|
138
140
|
// Defaults taken from svm_learn_main
|
139
141
|
VALUE inter_val, temp_ary, svm_type, svm_type_ruby_str;
|
140
142
|
char *svm_type_str;
|
141
143
|
|
142
144
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("predfile"));
|
143
145
|
if(1 == check_string_param(inter_val,
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
146
|
+
"trans_predictions",
|
147
|
+
(char *)&c_learn_param->predfile,
|
148
|
+
"predfile",
|
149
|
+
error_message)){
|
148
150
|
return 1;
|
149
151
|
}
|
150
152
|
|
151
153
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("alphafile"));
|
152
154
|
if(1 == check_string_param(inter_val,
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
155
|
+
"",
|
156
|
+
(char*)&c_learn_param->alphafile,
|
157
|
+
"alphafile",
|
158
|
+
error_message)){
|
157
159
|
return 1;
|
158
160
|
}
|
159
161
|
|
160
162
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("biased_hyperplane"));
|
161
163
|
if(1 == check_bool_param(inter_val,
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
164
|
+
1L,
|
165
|
+
&(c_learn_param->biased_hyperplane),
|
166
|
+
"biased_hyperplane",
|
167
|
+
error_message)){
|
166
168
|
return 1;
|
167
169
|
}
|
168
170
|
|
169
171
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("sharedslack"));
|
170
172
|
if(1 == check_bool_param(inter_val,
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
173
|
+
0L,
|
174
|
+
&(c_learn_param->sharedslack),
|
175
|
+
"sharedslack",
|
176
|
+
error_message)){
|
175
177
|
return 1;
|
176
178
|
}
|
177
179
|
|
178
180
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("remove_inconsistent"));
|
179
181
|
if(1 == check_bool_param(inter_val,
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
182
|
+
0L,
|
183
|
+
&(c_learn_param->remove_inconsistent),
|
184
|
+
"remove_inconsistent",
|
185
|
+
error_message)){
|
184
186
|
return 1;
|
185
187
|
}
|
186
188
|
|
187
189
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("skip_final_opt_check"));
|
188
190
|
if(1 == check_bool_param(inter_val,
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
191
|
+
0L,
|
192
|
+
&(c_learn_param->skip_final_opt_check),
|
193
|
+
"skip_final_opt_check",
|
194
|
+
error_message)){
|
193
195
|
return 1;
|
194
196
|
}
|
195
197
|
|
196
198
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_newvarsinqp"));
|
197
199
|
if(1 == check_bool_param(inter_val,
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
200
|
+
0L,
|
201
|
+
&(c_learn_param->svm_newvarsinqp),
|
202
|
+
"svm_newvarsinqp",
|
203
|
+
error_message)){
|
202
204
|
return 1;
|
203
205
|
}
|
204
206
|
|
205
207
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("compute_loo"));
|
206
208
|
if(1 == check_bool_param(inter_val,
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
209
|
+
0L,
|
210
|
+
&(c_learn_param->compute_loo),
|
211
|
+
"compute_loo",
|
212
|
+
error_message)){
|
211
213
|
return 1;
|
212
214
|
}
|
213
215
|
|
214
216
|
|
215
217
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_maxqpsize"));
|
216
218
|
if(1 == check_long_param(inter_val,
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
219
|
+
10L,
|
220
|
+
&(c_learn_param->svm_maxqpsize),
|
221
|
+
"svm_maxqpsize",
|
222
|
+
error_message)){
|
221
223
|
return 1;
|
222
224
|
}
|
223
225
|
|
224
226
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_iter_to_shrink"));
|
225
227
|
if(1 == check_long_param(inter_val,
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
228
|
+
-9999,
|
229
|
+
&(c_learn_param->svm_iter_to_shrink),
|
230
|
+
"svm_iter_to_shrink",
|
231
|
+
error_message)){
|
230
232
|
return 1;
|
231
233
|
}
|
232
234
|
|
233
235
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("maxiter"));
|
234
236
|
if(1 == check_long_param(inter_val,
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
237
|
+
100000,
|
238
|
+
&(c_learn_param->maxiter),
|
239
|
+
"maxiter",
|
240
|
+
error_message)){
|
239
241
|
return 1;
|
240
242
|
}
|
241
243
|
|
242
244
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("kernel_cache_size"));
|
243
245
|
if(1 == check_long_param(inter_val,
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
246
|
+
40L,
|
247
|
+
&(c_learn_param->kernel_cache_size),
|
248
|
+
"kernel_cache_size",
|
249
|
+
error_message)){
|
248
250
|
return 1;
|
249
251
|
}
|
250
252
|
|
251
253
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("xa_depth"));
|
252
254
|
if(1 == check_long_param(inter_val,
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
255
|
+
0L,
|
256
|
+
&(c_learn_param->xa_depth),
|
257
|
+
"xa_depth",
|
258
|
+
error_message)){
|
257
259
|
return 1;
|
258
260
|
}
|
259
261
|
|
260
262
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_c"));
|
261
263
|
if(1 == check_double_param(inter_val,
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
264
|
+
0.0,
|
265
|
+
&(c_learn_param->svm_c),
|
266
|
+
"svm_c",
|
267
|
+
error_message)){
|
266
268
|
return 1;
|
267
269
|
}
|
268
270
|
|
269
271
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("eps"));
|
270
272
|
if(1 == check_double_param(inter_val,
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
273
|
+
0.1,
|
274
|
+
&(c_learn_param->eps),
|
275
|
+
"eps",
|
276
|
+
error_message)){
|
275
277
|
return 1;
|
276
278
|
}
|
277
279
|
|
278
280
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("transduction_posratio"));
|
279
281
|
if(1 == check_double_param(inter_val,
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
282
|
+
-1.0,
|
283
|
+
&(c_learn_param->transduction_posratio),
|
284
|
+
"transduction_posratio",
|
285
|
+
error_message)){
|
284
286
|
return 1;
|
285
287
|
}
|
286
288
|
|
287
289
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_costratio"));
|
288
290
|
if(1 == check_double_param(inter_val,
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
291
|
+
1.0,
|
292
|
+
&(c_learn_param->svm_costratio),
|
293
|
+
"svm_costratio",
|
294
|
+
error_message)){
|
293
295
|
return 1;
|
294
296
|
}
|
295
297
|
|
296
298
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_costratio_unlab"));
|
297
299
|
if(1 == check_double_param(inter_val,
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
300
|
+
1.0,
|
301
|
+
&(c_learn_param->svm_costratio_unlab),
|
302
|
+
"svm_costratio_unlab",
|
303
|
+
error_message)){
|
302
304
|
return 1;
|
303
305
|
}
|
304
306
|
|
305
307
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_unlabbound"));
|
306
308
|
if(1 == check_double_param(inter_val,
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
309
|
+
1.0000000000000001e-05,
|
310
|
+
&(c_learn_param->svm_unlabbound),
|
311
|
+
"svm_unlabbound",
|
312
|
+
error_message)){
|
311
313
|
return 1;
|
312
314
|
}
|
313
315
|
|
314
316
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("epsilon_crit"));
|
315
317
|
if(1 == check_double_param(inter_val,
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
318
|
+
0.001,
|
319
|
+
&(c_learn_param->epsilon_crit),
|
320
|
+
"epsilon_crit",
|
321
|
+
error_message)){
|
320
322
|
return 1;
|
321
323
|
}
|
322
324
|
|
323
325
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("epsilon_a"));
|
324
326
|
if(1 == check_double_param(inter_val,
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
327
|
+
1E-15,
|
328
|
+
&(c_learn_param->epsilon_a),
|
329
|
+
"epsilon_a",
|
330
|
+
error_message)){
|
329
331
|
return 1;
|
330
332
|
}
|
331
333
|
|
332
334
|
c_learn_param->rho=1.0;
|
333
335
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("rho"));
|
334
336
|
if(1 == check_double_param(inter_val,
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
337
|
+
1.0,
|
338
|
+
&(c_learn_param->rho),
|
339
|
+
"rho",
|
340
|
+
error_message)){
|
339
341
|
return 1;
|
340
342
|
}
|
341
343
|
|
@@ -343,41 +345,42 @@ int setup_learn_params(LEARN_PARM *c_learn_param, VALUE r_hash, char *error_mess
|
|
343
345
|
return 0;
|
344
346
|
}
|
345
347
|
|
346
|
-
int
|
348
|
+
int
|
349
|
+
setup_kernel_params(KERNEL_PARM *c_kernel_param, VALUE r_hash, char *error_message){
|
347
350
|
VALUE inter_val;
|
348
351
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("poly_degree"));
|
349
352
|
if(1 == check_long_param(inter_val,
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
353
|
+
3L,
|
354
|
+
&(c_kernel_param->poly_degree),
|
355
|
+
"poly_degree",
|
356
|
+
error_message)){
|
354
357
|
return 1;
|
355
358
|
}
|
356
359
|
|
357
360
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("rbf_gamma"));
|
358
361
|
if(1 == check_double_param(inter_val,
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
362
|
+
1.0,
|
363
|
+
&(c_kernel_param->rbf_gamma),
|
364
|
+
"rbf_gamma",
|
365
|
+
error_message)){
|
363
366
|
return 1;
|
364
367
|
}
|
365
368
|
|
366
369
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("coef_lin"));
|
367
370
|
if(1 == check_double_param(inter_val,
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
371
|
+
1.0,
|
372
|
+
&(c_kernel_param->coef_lin),
|
373
|
+
"coef_lin",
|
374
|
+
error_message)){
|
372
375
|
return 1;
|
373
376
|
}
|
374
377
|
|
375
378
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("coef_const"));
|
376
379
|
if(1 == check_double_param(inter_val,
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
380
|
+
1.0,
|
381
|
+
&(c_kernel_param->coef_const),
|
382
|
+
"coef_const",
|
383
|
+
error_message)){
|
381
384
|
return 1;
|
382
385
|
}
|
383
386
|
|
@@ -634,6 +637,18 @@ model_support_vectors_count(VALUE self){
|
|
634
637
|
return INT2FIX(m->sv_num);
|
635
638
|
}
|
636
639
|
|
640
|
+
static VALUE
|
641
|
+
model_write_to_file(VALUE self, VALUE pahtofile){
|
642
|
+
Check_Type(pahtofile, T_STRING);
|
643
|
+
|
644
|
+
MODEL *m;
|
645
|
+
Data_Get_Struct(self, MODEL, m);
|
646
|
+
|
647
|
+
write_model(StringValuePtr(pahtofile), m);
|
648
|
+
|
649
|
+
return Qnil;
|
650
|
+
}
|
651
|
+
|
637
652
|
static VALUE
|
638
653
|
model_total_words(VALUE self){
|
639
654
|
MODEL *m;
|
@@ -745,8 +760,9 @@ Init_svmredlight(){
|
|
745
760
|
rb_mSvmLight = rb_define_module("SVMLight");
|
746
761
|
//Model
|
747
762
|
rb_cModel = rb_define_class_under(rb_mSvmLight, "Model", rb_cObject);
|
748
|
-
rb_define_singleton_method(rb_cModel, "
|
763
|
+
rb_define_singleton_method(rb_cModel, "from_file", model_read_from_file, 1);
|
749
764
|
rb_define_singleton_method(rb_cModel, "learn_classification", model_learn_classification, 5);
|
765
|
+
rb_define_method(rb_cModel, "to_file", model_write_to_file, 1);
|
750
766
|
rb_define_method(rb_cModel, "support_vectors_count", model_support_vectors_count, 0);
|
751
767
|
rb_define_method(rb_cModel, "total_words", model_total_words, 0);
|
752
768
|
rb_define_method(rb_cModel, "classify", model_classify_example, 1);
|
data/lib/svmredlight.rb
CHANGED
data/lib/svmredlight/document.rb
CHANGED
@@ -1,9 +1,7 @@
|
|
1
1
|
module SVMLight
|
2
|
-
# A document is the Ruby representation of a DOC structure in SVMlight, it contains a
|
3
|
-
#
|
4
|
-
# correspondent weights.
|
2
|
+
# A document is the Ruby representation of a DOC structure in SVMlight, it contains a queryid, a slackid, a costfactor
|
3
|
+
# ( c ) and a vector with feature numbers and their correspondent weights.
|
5
4
|
class Document
|
6
|
-
|
7
5
|
# @param [Hash] vector a hash where the keys are feature numbers and the values its weights
|
8
6
|
# @param [Hash] opts the options coincide with SVMLight parameters to the create_example function, the default values for all the options are 0
|
9
7
|
# @option [:docnum] Numeric docum
|
data/lib/svmredlight/model.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
module SVMLight
|
2
|
+
|
3
|
+
class MissingModelFile < StandardError; end
|
2
4
|
# A model is the product of training a SVM, once created it can take documents as inputs
|
3
5
|
# and act on them (for instance, by classifying them). Models can also be read from files
|
4
6
|
# created by svm_learn.
|
@@ -16,7 +18,48 @@ module SVMLight
|
|
16
18
|
|
17
19
|
learn_classification(documents_and_lables, learn_params, kernel_params, false, alphas)
|
18
20
|
end
|
19
|
-
|
21
|
+
|
20
22
|
private_class_method :learn_classification
|
23
|
+
private_class_method :from_file
|
24
|
+
|
25
|
+
# in self.read_from_file and #write_to_file
|
26
|
+
#
|
27
|
+
# This is an anti-pattern. Checking for existence of resources is normally something to be avoided. Trying to open
|
28
|
+
# the resource and then rescuing the exception/reading the error code is a much better practice, however SVMLight
|
29
|
+
# will call exit(1) if the file does not exist, and that in turn will kill the Ruby VM, so in this case, to
|
30
|
+
# minimize that possibility, I optimistically check for the file's existence and hope it is still there when it is
|
31
|
+
# actually time to open it.
|
32
|
+
#
|
33
|
+
# TODO: Come up with a proper replacement for those methods, probably simply reimplementing them in svmredlight.c
|
34
|
+
# and raising an exception when files cannot be opened.
|
35
|
+
|
36
|
+
# Will load an existent model from a file
|
37
|
+
# @param [String] pahtofile path to the model file
|
38
|
+
def self.read_from_file(pahtofile)
|
39
|
+
if File.exists?(pahtofile) && File.file?(pahtofile)
|
40
|
+
from_file(pahtofile)
|
41
|
+
|
42
|
+
else
|
43
|
+
|
44
|
+
raise MissingModelFile, "the #{pahtofile} does not exists or is not a file"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
private :to_file
|
49
|
+
|
50
|
+
# Will create a file containing the model info; the model info can be turned back into a model by using
|
51
|
+
# Model.read_from_file
|
52
|
+
# @param [String] pahtofile
|
53
|
+
def write_to_file(pahtofile)
|
54
|
+
dir = File.dirname(pahtofile)
|
55
|
+
|
56
|
+
if File.directory?(dir) && File.writable?(dir)
|
57
|
+
to_file(pahtofile)
|
58
|
+
|
59
|
+
else
|
60
|
+
raise ModelWriteError, "impossible to write #{pahtofile}"
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|
21
64
|
end
|
22
65
|
end
|
data/svmredlight.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{svmredlight}
|
8
|
-
s.version = "0.1.
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Camilo Lopez"]
|
12
|
-
s.date = %q{2011-09-
|
12
|
+
s.date = %q{2011-09-22}
|
13
13
|
s.description = %q{Ruby interface to SVMLight}
|
14
14
|
s.email = %q{camilo@camilolopez.com}
|
15
15
|
s.extensions = ["ext/extconf.rb"]
|
data/test/helper.rb
CHANGED
data/test/test_document.rb
CHANGED
@@ -3,53 +3,59 @@ include SVMLight
|
|
3
3
|
|
4
4
|
class TestDocument < Test::Unit::TestCase
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
6
|
+
context "creating a new document" do
|
7
|
+
|
8
|
+
should "succed when using #create" do
|
9
|
+
d = Document.create(0, 0.5, 1, 0, [[1, 1.0 ], [4, 0.0] , [10, 0.0] ,[ 11, 0.5 ]])
|
10
|
+
assert_kind_of Document, d
|
11
|
+
end
|
12
|
+
|
13
|
+
should "accept integers as feature weights" do
|
14
|
+
d = Document.create(0, 0.5, 1, 0, [[1, 0 ], [4, 0.0] , [10, 0.0] ,[ 11, 0.5 ]])
|
15
|
+
assert_kind_of Document, d
|
16
|
+
end
|
17
|
+
|
18
|
+
should "create documents useing new as well" do
|
19
|
+
d = Document.new({1 => 566.0, 4 => 133.0}, {docnum: 10, slackid: 1, queryid: 2, costfactor: 0.5})
|
20
|
+
|
21
|
+
assert_equal 10, d.docnum
|
22
|
+
assert_equal 1, d.slackid
|
23
|
+
assert_equal 2, d.queryid
|
24
|
+
assert_equal 0.5, d.costfactor
|
25
|
+
end
|
26
|
+
|
27
|
+
should "raise argument error if any of the word numbers is less or equal to 0" do
|
28
|
+
assert_raise(ArgumentError){ Document.create(0, 0.5, 1, 0, [[0, 1.0 ], [10, 0.0 ], [20, 0.0], [21, 0.1 ]]) }
|
29
|
+
assert_raise(ArgumentError){ Document.create(1, 0.5, 1, 0, [[-1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])}
|
30
|
+
end
|
31
|
+
|
32
|
+
should "raise type error when the fourth argument is not an array" do
|
33
|
+
assert_raise(TypeError) { Document.create(-1, 0, 1, 0, {}) }
|
34
|
+
end
|
35
|
+
|
36
|
+
should "raise type error when the fourth argument is empty" do
|
37
|
+
assert_raise(ArgumentError) { Document.create(-1, 0, 1, 0 [])}
|
38
|
+
end
|
15
39
|
|
16
|
-
def test_create_using_new
|
17
|
-
d = Document.new({1 => 566.0, 4 => 133.0}, {docnum: 10, slackid: 1, queryid: 2, costfactor: 0.5})
|
18
|
-
|
19
|
-
assert_equal 10, d.docnum
|
20
|
-
assert_equal 1, d.slackid
|
21
|
-
assert_equal 2, d.queryid
|
22
|
-
assert_equal 0.5, d.costfactor
|
23
40
|
end
|
41
|
+
|
42
|
+
context 'a document' do
|
43
|
+
should "have accessible docnum, queryid, slackid, and, costfacor" do
|
44
|
+
d1 = Document.create(0, 0.5, 1, 0, [[1, 1.0 ], [10, 0.0 ], [20, 0.0], [21, 0.1 ]])
|
45
|
+
d2 = Document.create(1, 0.6, 2, 1, [[1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])
|
24
46
|
|
25
|
-
|
26
|
-
|
27
|
-
d2 = Document.create(1, 0.6, 2, 1, [[1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])
|
28
|
-
|
29
|
-
assert_equal 0, d1.docnum
|
30
|
-
assert_equal 1, d2.docnum
|
31
|
-
|
32
|
-
assert_equal 1, d1.slackid
|
33
|
-
assert_equal 2, d2.slackid
|
47
|
+
assert_equal 0, d1.docnum
|
48
|
+
assert_equal 1, d2.docnum
|
34
49
|
|
35
|
-
|
36
|
-
|
50
|
+
assert_equal 1, d1.slackid
|
51
|
+
assert_equal 2, d2.slackid
|
37
52
|
|
38
|
-
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
def test_all_word_numbers_should_be_greater_than_zero
|
43
|
-
assert_raise(ArgumentError){ Document.create(0, 0.5, 1, 0, [[0, 1.0 ], [10, 0.0 ], [20, 0.0], [21, 0.1 ]]) }
|
44
|
-
assert_raise(ArgumentError){ Document.create(1, 0.5, 1, 0, [[-1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])}
|
45
|
-
end
|
46
|
-
|
47
|
-
def test_create_with_no_array
|
48
|
-
assert_raise(TypeError) { Document.create(-1, 0, 1, 0, {}) }
|
49
|
-
end
|
53
|
+
assert_equal 0, d1.queryid
|
54
|
+
assert_equal 1, d2.queryid
|
50
55
|
|
51
|
-
|
52
|
-
|
56
|
+
assert_equal 0.5, d1.costfactor
|
57
|
+
assert_equal 0.6, d2.costfactor
|
58
|
+
end
|
53
59
|
end
|
54
60
|
end
|
55
61
|
|
data/test/test_model.rb
CHANGED
@@ -3,112 +3,157 @@ include SVMLight
|
|
3
3
|
|
4
4
|
class TestModel < Test::Unit::TestCase
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
6
|
+
context "reading a model from file" do
|
7
|
+
|
8
|
+
setup do
|
9
|
+
@file_name = 'test/assets/model'
|
10
|
+
end
|
11
|
+
|
12
|
+
should "read properly from a well formed file" do
|
13
|
+
assert m = Model.read_from_file(@file_name)
|
14
|
+
assert_equal 3877, m.support_vectors_count
|
15
|
+
assert_equal 39118, m.total_words
|
16
|
+
end
|
16
17
|
|
17
|
-
|
18
|
-
|
19
|
-
assert_kind_of Model, m
|
18
|
+
should "classify successfully after reading the model from a file" do
|
19
|
+
m = Model.read_from_file(@file_name)
|
20
20
|
|
21
|
-
|
22
|
-
assert_kind_of
|
21
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1 ,v.to_f]} ) )
|
22
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
23
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0, 0, 0, 0.8, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
24
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0.5, 0, 0, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
25
|
+
end
|
26
|
+
|
27
|
+
should "raise file not found exception when file does not exists" do
|
28
|
+
assert_raises(MissingModelFile){ Model.read_from_file(@file_name + 'bleh') }
|
23
29
|
end
|
24
30
|
end
|
25
31
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
32
|
+
context "writting a model to a file" do
|
33
|
+
setup do
|
34
|
+
@features ||= [
|
35
|
+
[ [1,0.6], [11, 0.0], [34, 0.1] ],
|
36
|
+
[ [5,0.4], [15, 0.0], [30, 0.1] ],
|
37
|
+
[ [1,0.1], [13, 0.0], [31, 0.1] ],
|
38
|
+
[ [7,0.7], [15, 0.0], [35, 0.1] ],
|
39
|
+
[ [5,0.6], [19, 0.0], [44, 0.1] ],
|
40
|
+
]
|
41
|
+
|
42
|
+
@docs_and_labels ||= @features.each_with_index.map do |feature, index|
|
43
|
+
[ Document.create(index + 1, 1, 0, 0, feature), index%2 * -1]
|
44
|
+
end
|
45
|
+
|
46
|
+
@filepath = './test/assets/written_model'
|
47
|
+
@model = Model.new(:classification, @docs_and_labels, {}, {}, nil)
|
48
|
+
end
|
49
|
+
|
50
|
+
should "write a model from memmory to a file" do
|
51
|
+
@model.write_to_file(@filepath)
|
31
52
|
|
32
|
-
|
33
|
-
|
53
|
+
assert File.exists?(@filepath)
|
54
|
+
assert File.file?(@filepath)
|
55
|
+
# TODO: Implement actual model equality
|
56
|
+
assert_equal @model.support_vectors_count, Model.read_from_file(@filepath).support_vectors_count
|
34
57
|
end
|
35
58
|
|
59
|
+
# Need to find a good way to test this without relying too much on the environment
|
60
|
+
should "raise ModelWriteError when it is impossible to write a model file"
|
61
|
+
|
62
|
+
teardown do
|
63
|
+
`rm #{@filepath} &> /dev/null`
|
64
|
+
end
|
36
65
|
end
|
37
66
|
|
38
|
-
|
39
|
-
|
40
|
-
|
67
|
+
context "when learning from new documents" do
|
68
|
+
|
69
|
+
setup do
|
70
|
+
@features ||= [
|
71
|
+
[ [1,0.6], [11, 0.0], [34, 0.1] ],
|
72
|
+
[ [5,0.4], [15, 0.0], [30, 0.1] ],
|
73
|
+
[ [1,0.1], [13, 0.0], [31, 0.1] ],
|
74
|
+
[ [7,0.7], [15, 0.0], [35, 0.1] ],
|
75
|
+
[ [5,0.6], [19, 0.0], [44, 0.1] ],
|
76
|
+
]
|
77
|
+
|
78
|
+
@docs_and_labels ||= @features.each_with_index.map do |feature, index|
|
79
|
+
[ Document.create(index + 1, 1, 0, 0, feature), index%2 * -1]
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
should "learn classification with default arguments" do
|
84
|
+
m = Model.new(:classification, @docs_and_labels, {}, {}, nil)
|
85
|
+
assert_kind_of Model, m
|
86
|
+
assert_equal 44, m.total_words
|
87
|
+
assert_equal 5, m.totdoc
|
88
|
+
|
89
|
+
@docs_and_labels.each_with_index do |item, i|
|
90
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
should "learn classification with alpha values" do
|
95
|
+
m = Model.new(:classification, @docs_and_labels, {}, {}, [1, 0.0] * 50)
|
96
|
+
assert_kind_of Model, m
|
97
|
+
|
98
|
+
@docs_and_labels.each_with_index do |item, i|
|
99
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
should "raise argument error when one of the alphas is not numeric " do
|
104
|
+
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, {}, {}, [1, {}] )}
|
105
|
+
end
|
106
|
+
|
107
|
+
should "learn classification and accept learn parameters" do
|
108
|
+
learn_params = {
|
41
109
|
"predfile" => "custom_file",
|
42
110
|
"alphafile" => "alpha",
|
43
111
|
"biased_hyperplane" => false,
|
44
112
|
"sharedslack" => false,
|
45
113
|
"remove_inconsistent" => true
|
46
|
-
|
114
|
+
}
|
47
115
|
|
48
|
-
|
49
|
-
|
116
|
+
m = Model.new(:classification, @docs_and_labels, learn_params, {}, nil)
|
117
|
+
assert_kind_of Model, m
|
50
118
|
|
51
|
-
|
52
|
-
|
119
|
+
@docs_and_labels.each_with_index do |item, i|
|
120
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
121
|
+
end
|
53
122
|
end
|
54
|
-
end
|
55
123
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
124
|
+
should "raise argument error when learn parameters are invalid" do
|
125
|
+
learn_params = {"svm_c" => -1}
|
126
|
+
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, learn_params, {}, nil)}
|
127
|
+
learn_params = {"svm_iter_to_shrink" => -1}
|
128
|
+
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, learn_params, {}, nil)}
|
129
|
+
end
|
130
|
+
|
131
|
+
should "learn calssification while accepting kernel paramters" do
|
62
132
|
|
63
|
-
|
64
|
-
|
65
|
-
kernel_params = {
|
133
|
+
kernel_params = {
|
66
134
|
"poly_degree" => 3,
|
67
135
|
"rbf_gamma" => 0.5,
|
68
136
|
"coef_lin" => 0.4,
|
69
137
|
"coef_const" => 0.56
|
70
|
-
|
138
|
+
}
|
71
139
|
|
72
|
-
|
73
|
-
|
140
|
+
m = Model.new(:classification, @docs_and_labels, {}, kernel_params, nil)
|
141
|
+
assert_kind_of Model, m
|
74
142
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_learn_classification_with_learn_params_when_predfile_is_not_string
|
81
|
-
|
82
|
-
learn_params = { "predfile" => {}}
|
83
|
-
|
84
|
-
assert_raise(ArgumentError) do
|
85
|
-
Model.new(:classification, @docs_and_labels, learn_params, {}, [1, 0.0, 1])
|
143
|
+
@docs_and_labels.each_with_index do |item, i|
|
144
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
145
|
+
end
|
86
146
|
end
|
87
147
|
|
88
|
-
|
148
|
+
should "raise argument error when predfile is not string" do
|
89
149
|
|
90
|
-
|
91
|
-
@docs_and_labels << []
|
92
|
-
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, {}, {}, nil)}
|
93
|
-
end
|
150
|
+
learn_params = { "predfile" => {}}
|
94
151
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
def test_read
|
100
|
-
assert m = Model.read_from_file('test/assets/model')
|
101
|
-
assert_equal 3877, m.support_vectors_count
|
102
|
-
assert_equal 39118, m.total_words
|
103
|
-
end
|
152
|
+
assert_raise(ArgumentError) do
|
153
|
+
Model.new(:classification, @docs_and_labels, learn_params, {}, [1, 0.0, 1])
|
154
|
+
end
|
155
|
+
end
|
104
156
|
|
105
|
-
def test_classify
|
106
|
-
m = Model.read_from_file('test/assets/model')
|
107
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1 ,v.to_f]} ) )
|
108
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
109
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0, 0, 0, 0.8, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
110
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0.5, 0, 0, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
111
157
|
end
|
112
|
-
|
113
158
|
end
|
114
159
|
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: svmredlight
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.1.
|
5
|
+
version: 0.1.1
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Camilo Lopez
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-09-
|
13
|
+
date: 2011-09-22 00:00:00 -04:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
113
113
|
requirements:
|
114
114
|
- - ">="
|
115
115
|
- !ruby/object:Gem::Version
|
116
|
-
hash:
|
116
|
+
hash: 2966611142819785204
|
117
117
|
segments:
|
118
118
|
- 0
|
119
119
|
version: "0"
|