svmredlight 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +1 -0
- data/README.rdoc +6 -0
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/ext/svmredlight.c +142 -126
- data/lib/svmredlight.rb +2 -2
- data/lib/svmredlight/document.rb +2 -4
- data/lib/svmredlight/model.rb +44 -1
- data/svmredlight.gemspec +2 -2
- data/test/helper.rb +1 -0
- data/test/test_document.rb +47 -41
- data/test/test_model.rb +120 -75
- metadata +3 -3
data/.document
CHANGED
data/README.rdoc
CHANGED
@@ -7,9 +7,15 @@ A partial interface to SVM-light [http://svmlight.joachims.org/] using it you ca
|
|
7
7
|
|
8
8
|
As of now it's known to work with SVM 6.02.
|
9
9
|
|
10
|
+
=== Installing svmlight as a library
|
11
|
+
|
10
12
|
Make sure to build the libsvmlight.o version of svmlight by using
|
11
13
|
"make libsvmlight_hideo".
|
12
14
|
|
15
|
+
Make sure the .h files in the svmlight distribution are in your include path, inside a subdirectory called svm_light,
|
16
|
+
and the object code for the library is in your library path (/usr/lib for instance).
|
17
|
+
|
18
|
+
|
13
19
|
|
14
20
|
== Document
|
15
21
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ require 'rake'
|
|
14
14
|
require 'jeweler'
|
15
15
|
Jeweler::Tasks.new do |gem|
|
16
16
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
-
gem.version = '0.1.
|
17
|
+
gem.version = '0.1.1'
|
18
18
|
gem.name = "svmredlight"
|
19
19
|
gem.homepage = "http://github.com/camilo/svmredlight"
|
20
20
|
gem.license = "MIT"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.1
|
data/ext/svmredlight.c
CHANGED
@@ -9,13 +9,10 @@ is_linear(MODEL *model){
|
|
9
9
|
return model->kernel_parm.kernel_type == 0;
|
10
10
|
}
|
11
11
|
|
12
|
-
// Modules and Classes
|
13
12
|
static VALUE rb_mSvmLight;
|
14
13
|
static VALUE rb_cModel;
|
15
14
|
static VALUE rb_cDocument;
|
16
15
|
|
17
|
-
// GC functions
|
18
|
-
|
19
16
|
/* Not using deep free anymore, let ruby call free on the documents otherwise we might end
|
20
17
|
* up having double free problems, from svm_learn_main: Warning: The model contains
|
21
18
|
* references to the original data 'docs'. If you want to free the original data, and
|
@@ -52,11 +49,12 @@ model_read_from_file(VALUE klass, VALUE filename){
|
|
52
49
|
/* Helper function type checks a string meant to be used as a learn_parm, in case of error
|
53
50
|
* returns 1 and sets the correct exception message in error, on success returns 0 and
|
54
51
|
* copies the c string data of new_val to target*/
|
55
|
-
int
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
52
|
+
int
|
53
|
+
check_string_param(VALUE new_val,
|
54
|
+
const char *default_val,
|
55
|
+
char *target,
|
56
|
+
const char *name,
|
57
|
+
char *error){
|
60
58
|
|
61
59
|
if(TYPE(new_val) == T_STRING){
|
62
60
|
strlcpy(target, StringValuePtr(new_val), 199);
|
@@ -73,7 +71,8 @@ int check_string_param(VALUE new_val,
|
|
73
71
|
/* Helper function type checks a long meant to be used as a learn_parm or kernel_parm, in
|
74
72
|
* case of error returns 1 and sets the correct exception message in error, on success
|
75
73
|
* returns 0 and copies the c string data of new_val to target*/
|
76
|
-
int
|
74
|
+
int
|
75
|
+
check_long_param(VALUE new_val,
|
77
76
|
long default_val,
|
78
77
|
long *target,
|
79
78
|
const char *name,
|
@@ -93,11 +92,12 @@ int check_long_param(VALUE new_val,
|
|
93
92
|
/* Helper function type checks a double meant to be used as a learn_parm or kernel_parm, in
|
94
93
|
* case of error returns 1 and sets the correct exception message in error, on success
|
95
94
|
* returns 0 and stores the numeric value of new_val in target*/
|
96
|
-
int
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
95
|
+
int
|
96
|
+
check_double_param(VALUE new_val,
|
97
|
+
double default_val,
|
98
|
+
double *target,
|
99
|
+
const char *name,
|
100
|
+
char *error){
|
101
101
|
if(TYPE(new_val) == T_FLOAT || TYPE(new_val) == T_FIXNUM){
|
102
102
|
*target = NUM2DBL(new_val);
|
103
103
|
}else if(NIL_P(new_val) ){
|
@@ -113,11 +113,12 @@ int check_double_param(VALUE new_val,
|
|
113
113
|
/* Helper function type checks an int meant to be used as a boolean learn_parm or
|
114
114
|
* kernel_parm, in case of error returns 1 and sets the correct exception message in
|
115
115
|
* error, on success returns 0 and stores the boolean value of new_val as a long in target*/
|
116
|
-
int
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
116
|
+
int
|
117
|
+
check_bool_param(VALUE new_val,
|
118
|
+
long default_val,
|
119
|
+
long *target,
|
120
|
+
const char *name,
|
121
|
+
char *error){
|
121
122
|
if(TYPE(new_val) == T_TRUE){
|
122
123
|
*target = 1L;
|
123
124
|
}else if(TYPE(new_val) == T_FALSE){
|
@@ -134,208 +135,209 @@ int check_bool_param(VALUE new_val,
|
|
134
135
|
|
135
136
|
/* Helper function in charge of setting up the learn parameters before they are passed to
|
136
137
|
* the svm_learn_classification copies part of the logic in svm_learn_main.c */
|
137
|
-
int
|
138
|
+
int
|
139
|
+
setup_learn_params(LEARN_PARM *c_learn_param, VALUE r_hash, char *error_message){
|
138
140
|
// Defaults taken from svm_learn_main
|
139
141
|
VALUE inter_val, temp_ary, svm_type, svm_type_ruby_str;
|
140
142
|
char *svm_type_str;
|
141
143
|
|
142
144
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("predfile"));
|
143
145
|
if(1 == check_string_param(inter_val,
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
146
|
+
"trans_predictions",
|
147
|
+
(char *)&c_learn_param->predfile,
|
148
|
+
"predfile",
|
149
|
+
error_message)){
|
148
150
|
return 1;
|
149
151
|
}
|
150
152
|
|
151
153
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("alphafile"));
|
152
154
|
if(1 == check_string_param(inter_val,
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
155
|
+
"",
|
156
|
+
(char*)&c_learn_param->alphafile,
|
157
|
+
"alphafile",
|
158
|
+
error_message)){
|
157
159
|
return 1;
|
158
160
|
}
|
159
161
|
|
160
162
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("biased_hyperplane"));
|
161
163
|
if(1 == check_bool_param(inter_val,
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
164
|
+
1L,
|
165
|
+
&(c_learn_param->biased_hyperplane),
|
166
|
+
"biased_hyperplane",
|
167
|
+
error_message)){
|
166
168
|
return 1;
|
167
169
|
}
|
168
170
|
|
169
171
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("sharedslack"));
|
170
172
|
if(1 == check_bool_param(inter_val,
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
173
|
+
0L,
|
174
|
+
&(c_learn_param->sharedslack),
|
175
|
+
"sharedslack",
|
176
|
+
error_message)){
|
175
177
|
return 1;
|
176
178
|
}
|
177
179
|
|
178
180
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("remove_inconsistent"));
|
179
181
|
if(1 == check_bool_param(inter_val,
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
182
|
+
0L,
|
183
|
+
&(c_learn_param->remove_inconsistent),
|
184
|
+
"remove_inconsistent",
|
185
|
+
error_message)){
|
184
186
|
return 1;
|
185
187
|
}
|
186
188
|
|
187
189
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("skip_final_opt_check"));
|
188
190
|
if(1 == check_bool_param(inter_val,
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
191
|
+
0L,
|
192
|
+
&(c_learn_param->skip_final_opt_check),
|
193
|
+
"skip_final_opt_check",
|
194
|
+
error_message)){
|
193
195
|
return 1;
|
194
196
|
}
|
195
197
|
|
196
198
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_newvarsinqp"));
|
197
199
|
if(1 == check_bool_param(inter_val,
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
200
|
+
0L,
|
201
|
+
&(c_learn_param->svm_newvarsinqp),
|
202
|
+
"svm_newvarsinqp",
|
203
|
+
error_message)){
|
202
204
|
return 1;
|
203
205
|
}
|
204
206
|
|
205
207
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("compute_loo"));
|
206
208
|
if(1 == check_bool_param(inter_val,
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
209
|
+
0L,
|
210
|
+
&(c_learn_param->compute_loo),
|
211
|
+
"compute_loo",
|
212
|
+
error_message)){
|
211
213
|
return 1;
|
212
214
|
}
|
213
215
|
|
214
216
|
|
215
217
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_maxqpsize"));
|
216
218
|
if(1 == check_long_param(inter_val,
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
219
|
+
10L,
|
220
|
+
&(c_learn_param->svm_maxqpsize),
|
221
|
+
"svm_maxqpsize",
|
222
|
+
error_message)){
|
221
223
|
return 1;
|
222
224
|
}
|
223
225
|
|
224
226
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_iter_to_shrink"));
|
225
227
|
if(1 == check_long_param(inter_val,
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
228
|
+
-9999,
|
229
|
+
&(c_learn_param->svm_iter_to_shrink),
|
230
|
+
"svm_iter_to_shrink",
|
231
|
+
error_message)){
|
230
232
|
return 1;
|
231
233
|
}
|
232
234
|
|
233
235
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("maxiter"));
|
234
236
|
if(1 == check_long_param(inter_val,
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
237
|
+
100000,
|
238
|
+
&(c_learn_param->maxiter),
|
239
|
+
"maxiter",
|
240
|
+
error_message)){
|
239
241
|
return 1;
|
240
242
|
}
|
241
243
|
|
242
244
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("kernel_cache_size"));
|
243
245
|
if(1 == check_long_param(inter_val,
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
246
|
+
40L,
|
247
|
+
&(c_learn_param->kernel_cache_size),
|
248
|
+
"kernel_cache_size",
|
249
|
+
error_message)){
|
248
250
|
return 1;
|
249
251
|
}
|
250
252
|
|
251
253
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("xa_depth"));
|
252
254
|
if(1 == check_long_param(inter_val,
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
255
|
+
0L,
|
256
|
+
&(c_learn_param->xa_depth),
|
257
|
+
"xa_depth",
|
258
|
+
error_message)){
|
257
259
|
return 1;
|
258
260
|
}
|
259
261
|
|
260
262
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_c"));
|
261
263
|
if(1 == check_double_param(inter_val,
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
264
|
+
0.0,
|
265
|
+
&(c_learn_param->svm_c),
|
266
|
+
"svm_c",
|
267
|
+
error_message)){
|
266
268
|
return 1;
|
267
269
|
}
|
268
270
|
|
269
271
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("eps"));
|
270
272
|
if(1 == check_double_param(inter_val,
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
273
|
+
0.1,
|
274
|
+
&(c_learn_param->eps),
|
275
|
+
"eps",
|
276
|
+
error_message)){
|
275
277
|
return 1;
|
276
278
|
}
|
277
279
|
|
278
280
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("transduction_posratio"));
|
279
281
|
if(1 == check_double_param(inter_val,
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
282
|
+
-1.0,
|
283
|
+
&(c_learn_param->transduction_posratio),
|
284
|
+
"transduction_posratio",
|
285
|
+
error_message)){
|
284
286
|
return 1;
|
285
287
|
}
|
286
288
|
|
287
289
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_costratio"));
|
288
290
|
if(1 == check_double_param(inter_val,
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
291
|
+
1.0,
|
292
|
+
&(c_learn_param->svm_costratio),
|
293
|
+
"svm_costratio",
|
294
|
+
error_message)){
|
293
295
|
return 1;
|
294
296
|
}
|
295
297
|
|
296
298
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_costratio_unlab"));
|
297
299
|
if(1 == check_double_param(inter_val,
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
300
|
+
1.0,
|
301
|
+
&(c_learn_param->svm_costratio_unlab),
|
302
|
+
"svm_costratio_unlab",
|
303
|
+
error_message)){
|
302
304
|
return 1;
|
303
305
|
}
|
304
306
|
|
305
307
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("svm_unlabbound"));
|
306
308
|
if(1 == check_double_param(inter_val,
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
309
|
+
1.0000000000000001e-05,
|
310
|
+
&(c_learn_param->svm_unlabbound),
|
311
|
+
"svm_unlabbound",
|
312
|
+
error_message)){
|
311
313
|
return 1;
|
312
314
|
}
|
313
315
|
|
314
316
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("epsilon_crit"));
|
315
317
|
if(1 == check_double_param(inter_val,
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
318
|
+
0.001,
|
319
|
+
&(c_learn_param->epsilon_crit),
|
320
|
+
"epsilon_crit",
|
321
|
+
error_message)){
|
320
322
|
return 1;
|
321
323
|
}
|
322
324
|
|
323
325
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("epsilon_a"));
|
324
326
|
if(1 == check_double_param(inter_val,
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
327
|
+
1E-15,
|
328
|
+
&(c_learn_param->epsilon_a),
|
329
|
+
"epsilon_a",
|
330
|
+
error_message)){
|
329
331
|
return 1;
|
330
332
|
}
|
331
333
|
|
332
334
|
c_learn_param->rho=1.0;
|
333
335
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("rho"));
|
334
336
|
if(1 == check_double_param(inter_val,
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
337
|
+
1.0,
|
338
|
+
&(c_learn_param->rho),
|
339
|
+
"rho",
|
340
|
+
error_message)){
|
339
341
|
return 1;
|
340
342
|
}
|
341
343
|
|
@@ -343,41 +345,42 @@ int setup_learn_params(LEARN_PARM *c_learn_param, VALUE r_hash, char *error_mess
|
|
343
345
|
return 0;
|
344
346
|
}
|
345
347
|
|
346
|
-
int
|
348
|
+
int
|
349
|
+
setup_kernel_params(KERNEL_PARM *c_kernel_param, VALUE r_hash, char *error_message){
|
347
350
|
VALUE inter_val;
|
348
351
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("poly_degree"));
|
349
352
|
if(1 == check_long_param(inter_val,
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
353
|
+
3L,
|
354
|
+
&(c_kernel_param->poly_degree),
|
355
|
+
"poly_degree",
|
356
|
+
error_message)){
|
354
357
|
return 1;
|
355
358
|
}
|
356
359
|
|
357
360
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("rbf_gamma"));
|
358
361
|
if(1 == check_double_param(inter_val,
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
362
|
+
1.0,
|
363
|
+
&(c_kernel_param->rbf_gamma),
|
364
|
+
"rbf_gamma",
|
365
|
+
error_message)){
|
363
366
|
return 1;
|
364
367
|
}
|
365
368
|
|
366
369
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("coef_lin"));
|
367
370
|
if(1 == check_double_param(inter_val,
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
371
|
+
1.0,
|
372
|
+
&(c_kernel_param->coef_lin),
|
373
|
+
"coef_lin",
|
374
|
+
error_message)){
|
372
375
|
return 1;
|
373
376
|
}
|
374
377
|
|
375
378
|
inter_val = rb_hash_aref(r_hash, rb_str_new2("coef_const"));
|
376
379
|
if(1 == check_double_param(inter_val,
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
380
|
+
1.0,
|
381
|
+
&(c_kernel_param->coef_const),
|
382
|
+
"coef_const",
|
383
|
+
error_message)){
|
381
384
|
return 1;
|
382
385
|
}
|
383
386
|
|
@@ -634,6 +637,18 @@ model_support_vectors_count(VALUE self){
|
|
634
637
|
return INT2FIX(m->sv_num);
|
635
638
|
}
|
636
639
|
|
640
|
+
static VALUE
|
641
|
+
model_write_to_file(VALUE self, VALUE pahtofile){
|
642
|
+
Check_Type(pahtofile, T_STRING);
|
643
|
+
|
644
|
+
MODEL *m;
|
645
|
+
Data_Get_Struct(self, MODEL, m);
|
646
|
+
|
647
|
+
write_model(StringValuePtr(pahtofile), m);
|
648
|
+
|
649
|
+
return Qnil;
|
650
|
+
}
|
651
|
+
|
637
652
|
static VALUE
|
638
653
|
model_total_words(VALUE self){
|
639
654
|
MODEL *m;
|
@@ -745,8 +760,9 @@ Init_svmredlight(){
|
|
745
760
|
rb_mSvmLight = rb_define_module("SVMLight");
|
746
761
|
//Model
|
747
762
|
rb_cModel = rb_define_class_under(rb_mSvmLight, "Model", rb_cObject);
|
748
|
-
rb_define_singleton_method(rb_cModel, "
|
763
|
+
rb_define_singleton_method(rb_cModel, "from_file", model_read_from_file, 1);
|
749
764
|
rb_define_singleton_method(rb_cModel, "learn_classification", model_learn_classification, 5);
|
765
|
+
rb_define_method(rb_cModel, "to_file", model_write_to_file, 1);
|
750
766
|
rb_define_method(rb_cModel, "support_vectors_count", model_support_vectors_count, 0);
|
751
767
|
rb_define_method(rb_cModel, "total_words", model_total_words, 0);
|
752
768
|
rb_define_method(rb_cModel, "classify", model_classify_example, 1);
|
data/lib/svmredlight.rb
CHANGED
data/lib/svmredlight/document.rb
CHANGED
@@ -1,9 +1,7 @@
|
|
1
1
|
module SVMLight
|
2
|
-
# A document is the Ruby representation of a DOC structure in SVMlight, it contains a
|
3
|
-
#
|
4
|
-
# correspondent weights.
|
2
|
+
# A document is the Ruby representation of a DOC structure in SVMlight, it contains a queryid, a slackid, a costfactor
|
3
|
+
# ( c ) and a vector with feature numbers and their correspondent weights.
|
5
4
|
class Document
|
6
|
-
|
7
5
|
# @param [Hash] vector a hash where the keys are feature numbers and the values its weights
|
8
6
|
# @param [Hash] opts the options coincide with SVMLight parameters to the create_example function, the default values for all the options are 0
|
9
7
|
# @option [:docnum] Numeric docum
|
data/lib/svmredlight/model.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
module SVMLight
|
2
|
+
|
3
|
+
class MissingModelFile < StandardError; end
|
2
4
|
# A model is the product of training a SVM, once created it can take documents as inputs
|
3
5
|
# and act on them (for instance by classifying them). Models can also be read from files
|
4
6
|
# created by svm_learn.
|
@@ -16,7 +18,48 @@ module SVMLight
|
|
16
18
|
|
17
19
|
learn_classification(documents_and_lables, learn_params, kernel_params, false, alphas)
|
18
20
|
end
|
19
|
-
|
21
|
+
|
20
22
|
private_class_method :learn_classification
|
23
|
+
private_class_method :from_file
|
24
|
+
|
25
|
+
# Note on the existence checks in self.read_from_file and #write_to_file:
|
26
|
+
#
|
27
|
+
# This is an anti-pattern. Checking for existence of resources is normally something to be avoided. Trying to open
|
28
|
+
# the resource and then rescuing the exception/reading the error code is a much better practice, however SVMLight
|
29
|
+
# will call exit(1) if the file does not exist, and that in turn will kill the Ruby VM, so in this case, to
|
30
|
+
# minimize that possibility, I optimistically check for the file's existence and hope it is still there when it is
|
31
|
+
# actually time to open it.
|
32
|
+
#
|
33
|
+
# TODO: Come up with a proper replacement for those methods, probably simply reimplementing them in svmredlight.c
|
34
|
+
# and raising an exception when files cannot be opened.
|
35
|
+
|
36
|
+
# Will load an existing model from a file
|
37
|
+
# @param [String] pahtofile path to the model file
|
38
|
+
def self.read_from_file(pahtofile)
|
39
|
+
if File.exists?(pahtofile) && File.file?(pahtofile)
|
40
|
+
from_file(pahtofile)
|
41
|
+
|
42
|
+
else
|
43
|
+
|
44
|
+
raise MissingModelFile, "the #{pahtofile} does not exists or is not a file"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
private :to_file
|
49
|
+
|
50
|
+
# Will create a file containing the model info; the model info can be turned back into a model by using
|
51
|
+
# Model.read_from_file
|
52
|
+
# @param [String] pahtofile
|
53
|
+
def write_to_file(pahtofile)
|
54
|
+
dir = File.dirname(pahtofile)
|
55
|
+
|
56
|
+
if File.directory?(dir) && File.writable?(dir)
|
57
|
+
to_file(pahtofile)
|
58
|
+
|
59
|
+
else
|
60
|
+
raise ModelWriteError, "impossible to write #{pahtofile}"
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|
21
64
|
end
|
22
65
|
end
|
data/svmredlight.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{svmredlight}
|
8
|
-
s.version = "0.1.
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Camilo Lopez"]
|
12
|
-
s.date = %q{2011-09-
|
12
|
+
s.date = %q{2011-09-22}
|
13
13
|
s.description = %q{Ruby interface to SVMLight}
|
14
14
|
s.email = %q{camilo@camilolopez.com}
|
15
15
|
s.extensions = ["ext/extconf.rb"]
|
data/test/helper.rb
CHANGED
data/test/test_document.rb
CHANGED
@@ -3,53 +3,59 @@ include SVMLight
|
|
3
3
|
|
4
4
|
class TestDocument < Test::Unit::TestCase
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
6
|
+
context "creating a new document" do
|
7
|
+
|
8
|
+
should "succed when using #create" do
|
9
|
+
d = Document.create(0, 0.5, 1, 0, [[1, 1.0 ], [4, 0.0] , [10, 0.0] ,[ 11, 0.5 ]])
|
10
|
+
assert_kind_of Document, d
|
11
|
+
end
|
12
|
+
|
13
|
+
should "accept integers as feature weights" do
|
14
|
+
d = Document.create(0, 0.5, 1, 0, [[1, 0 ], [4, 0.0] , [10, 0.0] ,[ 11, 0.5 ]])
|
15
|
+
assert_kind_of Document, d
|
16
|
+
end
|
17
|
+
|
18
|
+
should "create documents useing new as well" do
|
19
|
+
d = Document.new({1 => 566.0, 4 => 133.0}, {docnum: 10, slackid: 1, queryid: 2, costfactor: 0.5})
|
20
|
+
|
21
|
+
assert_equal 10, d.docnum
|
22
|
+
assert_equal 1, d.slackid
|
23
|
+
assert_equal 2, d.queryid
|
24
|
+
assert_equal 0.5, d.costfactor
|
25
|
+
end
|
26
|
+
|
27
|
+
should "raise argument error if any of the word numbers is less or equal to 0" do
|
28
|
+
assert_raise(ArgumentError){ Document.create(0, 0.5, 1, 0, [[0, 1.0 ], [10, 0.0 ], [20, 0.0], [21, 0.1 ]]) }
|
29
|
+
assert_raise(ArgumentError){ Document.create(1, 0.5, 1, 0, [[-1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])}
|
30
|
+
end
|
31
|
+
|
32
|
+
should "raise type error when the fourth argument is not an array" do
|
33
|
+
assert_raise(TypeError) { Document.create(-1, 0, 1, 0, {}) }
|
34
|
+
end
|
35
|
+
|
36
|
+
should "raise type error when the fourth argument is empty" do
|
37
|
+
assert_raise(ArgumentError) { Document.create(-1, 0, 1, 0 [])}
|
38
|
+
end
|
15
39
|
|
16
|
-
def test_create_using_new
|
17
|
-
d = Document.new({1 => 566.0, 4 => 133.0}, {docnum: 10, slackid: 1, queryid: 2, costfactor: 0.5})
|
18
|
-
|
19
|
-
assert_equal 10, d.docnum
|
20
|
-
assert_equal 1, d.slackid
|
21
|
-
assert_equal 2, d.queryid
|
22
|
-
assert_equal 0.5, d.costfactor
|
23
40
|
end
|
41
|
+
|
42
|
+
context 'a document' do
|
43
|
+
should "have accessible docnum, queryid, slackid, and, costfacor" do
|
44
|
+
d1 = Document.create(0, 0.5, 1, 0, [[1, 1.0 ], [10, 0.0 ], [20, 0.0], [21, 0.1 ]])
|
45
|
+
d2 = Document.create(1, 0.6, 2, 1, [[1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])
|
24
46
|
|
25
|
-
|
26
|
-
|
27
|
-
d2 = Document.create(1, 0.6, 2, 1, [[1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])
|
28
|
-
|
29
|
-
assert_equal 0, d1.docnum
|
30
|
-
assert_equal 1, d2.docnum
|
31
|
-
|
32
|
-
assert_equal 1, d1.slackid
|
33
|
-
assert_equal 2, d2.slackid
|
47
|
+
assert_equal 0, d1.docnum
|
48
|
+
assert_equal 1, d2.docnum
|
34
49
|
|
35
|
-
|
36
|
-
|
50
|
+
assert_equal 1, d1.slackid
|
51
|
+
assert_equal 2, d2.slackid
|
37
52
|
|
38
|
-
|
39
|
-
|
40
|
-
end
|
41
|
-
|
42
|
-
def test_all_word_numbers_should_be_greater_than_zero
|
43
|
-
assert_raise(ArgumentError){ Document.create(0, 0.5, 1, 0, [[0, 1.0 ], [10, 0.0 ], [20, 0.0], [21, 0.1 ]]) }
|
44
|
-
assert_raise(ArgumentError){ Document.create(1, 0.5, 1, 0, [[-1, 1.0 ], [30, 0.0 ], [40, 0.0], [41, 0.1 ]])}
|
45
|
-
end
|
46
|
-
|
47
|
-
def test_create_with_no_array
|
48
|
-
assert_raise(TypeError) { Document.create(-1, 0, 1, 0, {}) }
|
49
|
-
end
|
53
|
+
assert_equal 0, d1.queryid
|
54
|
+
assert_equal 1, d2.queryid
|
50
55
|
|
51
|
-
|
52
|
-
|
56
|
+
assert_equal 0.5, d1.costfactor
|
57
|
+
assert_equal 0.6, d2.costfactor
|
58
|
+
end
|
53
59
|
end
|
54
60
|
end
|
55
61
|
|
data/test/test_model.rb
CHANGED
@@ -3,112 +3,157 @@ include SVMLight
|
|
3
3
|
|
4
4
|
class TestModel < Test::Unit::TestCase
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
6
|
+
context "reading a model from file" do
|
7
|
+
|
8
|
+
setup do
|
9
|
+
@file_name = 'test/assets/model'
|
10
|
+
end
|
11
|
+
|
12
|
+
should "read properly from a well formed file" do
|
13
|
+
assert m = Model.read_from_file(@file_name)
|
14
|
+
assert_equal 3877, m.support_vectors_count
|
15
|
+
assert_equal 39118, m.total_words
|
16
|
+
end
|
16
17
|
|
17
|
-
|
18
|
-
|
19
|
-
assert_kind_of Model, m
|
18
|
+
should "classify successfully after reading the model from a file" do
|
19
|
+
m = Model.read_from_file(@file_name)
|
20
20
|
|
21
|
-
|
22
|
-
assert_kind_of
|
21
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1 ,v.to_f]} ) )
|
22
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
23
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0, 0, 0, 0.8, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
24
|
+
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0.5, 0, 0, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
25
|
+
end
|
26
|
+
|
27
|
+
should "raise file not found exception when file does not exists" do
|
28
|
+
assert_raises(MissingModelFile){ Model.read_from_file(@file_name + 'bleh') }
|
23
29
|
end
|
24
30
|
end
|
25
31
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
32
|
+
context "writting a model to a file" do
|
33
|
+
setup do
|
34
|
+
@features ||= [
|
35
|
+
[ [1,0.6], [11, 0.0], [34, 0.1] ],
|
36
|
+
[ [5,0.4], [15, 0.0], [30, 0.1] ],
|
37
|
+
[ [1,0.1], [13, 0.0], [31, 0.1] ],
|
38
|
+
[ [7,0.7], [15, 0.0], [35, 0.1] ],
|
39
|
+
[ [5,0.6], [19, 0.0], [44, 0.1] ],
|
40
|
+
]
|
41
|
+
|
42
|
+
@docs_and_labels ||= @features.each_with_index.map do |feature, index|
|
43
|
+
[ Document.create(index + 1, 1, 0, 0, feature), index%2 * -1]
|
44
|
+
end
|
45
|
+
|
46
|
+
@filepath = './test/assets/written_model'
|
47
|
+
@model = Model.new(:classification, @docs_and_labels, {}, {}, nil)
|
48
|
+
end
|
49
|
+
|
50
|
+
should "write a model from memmory to a file" do
|
51
|
+
@model.write_to_file(@filepath)
|
31
52
|
|
32
|
-
|
33
|
-
|
53
|
+
assert File.exists?(@filepath)
|
54
|
+
assert File.file?(@filepath)
|
55
|
+
# TODO: Implement actual model equality
|
56
|
+
assert_equal @model.support_vectors_count, Model.read_from_file(@filepath).support_vectors_count
|
34
57
|
end
|
35
58
|
|
59
|
+
# Need to find a good way to test this without relying too much on the environment
|
60
|
+
should "raise ModelWriteError when it is impossible to write a model file"
|
61
|
+
|
62
|
+
teardown do
|
63
|
+
`rm #{@filepath} &> /dev/null`
|
64
|
+
end
|
36
65
|
end
|
37
66
|
|
38
|
-
|
39
|
-
|
40
|
-
|
67
|
+
context "when learning from new documents" do
|
68
|
+
|
69
|
+
setup do
|
70
|
+
@features ||= [
|
71
|
+
[ [1,0.6], [11, 0.0], [34, 0.1] ],
|
72
|
+
[ [5,0.4], [15, 0.0], [30, 0.1] ],
|
73
|
+
[ [1,0.1], [13, 0.0], [31, 0.1] ],
|
74
|
+
[ [7,0.7], [15, 0.0], [35, 0.1] ],
|
75
|
+
[ [5,0.6], [19, 0.0], [44, 0.1] ],
|
76
|
+
]
|
77
|
+
|
78
|
+
@docs_and_labels ||= @features.each_with_index.map do |feature, index|
|
79
|
+
[ Document.create(index + 1, 1, 0, 0, feature), index%2 * -1]
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
should "learn classification with default arguments" do
|
84
|
+
m = Model.new(:classification, @docs_and_labels, {}, {}, nil)
|
85
|
+
assert_kind_of Model, m
|
86
|
+
assert_equal 44, m.total_words
|
87
|
+
assert_equal 5, m.totdoc
|
88
|
+
|
89
|
+
@docs_and_labels.each_with_index do |item, i|
|
90
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
should "learn classification with alpha values" do
|
95
|
+
m = Model.new(:classification, @docs_and_labels, {}, {}, [1, 0.0] * 50)
|
96
|
+
assert_kind_of Model, m
|
97
|
+
|
98
|
+
@docs_and_labels.each_with_index do |item, i|
|
99
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
should "raise argument error when one of the alphas is not numeric " do
|
104
|
+
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, {}, {}, [1, {}] )}
|
105
|
+
end
|
106
|
+
|
107
|
+
should "learn classification and accept learn parameters" do
|
108
|
+
learn_params = {
|
41
109
|
"predfile" => "custom_file",
|
42
110
|
"alphafile" => "alpha",
|
43
111
|
"biased_hyperplane" => false,
|
44
112
|
"sharedslack" => false,
|
45
113
|
"remove_inconsistent" => true
|
46
|
-
|
114
|
+
}
|
47
115
|
|
48
|
-
|
49
|
-
|
116
|
+
m = Model.new(:classification, @docs_and_labels, learn_params, {}, nil)
|
117
|
+
assert_kind_of Model, m
|
50
118
|
|
51
|
-
|
52
|
-
|
119
|
+
@docs_and_labels.each_with_index do |item, i|
|
120
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
121
|
+
end
|
53
122
|
end
|
54
|
-
end
|
55
123
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
124
|
+
should "raise argument error when learn parameters are invalid" do
|
125
|
+
learn_params = {"svm_c" => -1}
|
126
|
+
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, learn_params, {}, nil)}
|
127
|
+
learn_params = {"svm_iter_to_shrink" => -1}
|
128
|
+
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, learn_params, {}, nil)}
|
129
|
+
end
|
130
|
+
|
131
|
+
should "learn calssification while accepting kernel paramters" do
|
62
132
|
|
63
|
-
|
64
|
-
|
65
|
-
kernel_params = {
|
133
|
+
kernel_params = {
|
66
134
|
"poly_degree" => 3,
|
67
135
|
"rbf_gamma" => 0.5,
|
68
136
|
"coef_lin" => 0.4,
|
69
137
|
"coef_const" => 0.56
|
70
|
-
|
138
|
+
}
|
71
139
|
|
72
|
-
|
73
|
-
|
140
|
+
m = Model.new(:classification, @docs_and_labels, {}, kernel_params, nil)
|
141
|
+
assert_kind_of Model, m
|
74
142
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_learn_classification_with_learn_params_when_predfile_is_not_string
|
81
|
-
|
82
|
-
learn_params = { "predfile" => {}}
|
83
|
-
|
84
|
-
assert_raise(ArgumentError) do
|
85
|
-
Model.new(:classification, @docs_and_labels, learn_params, {}, [1, 0.0, 1])
|
143
|
+
@docs_and_labels.each_with_index do |item, i|
|
144
|
+
assert_kind_of Numeric, m.classify(item.first), "failed in item # #{i}"
|
145
|
+
end
|
86
146
|
end
|
87
147
|
|
88
|
-
|
148
|
+
should "raise argument error when predfile is not string" do
|
89
149
|
|
90
|
-
|
91
|
-
@docs_and_labels << []
|
92
|
-
assert_raises(ArgumentError){Model.new(:classification, @docs_and_labels, {}, {}, nil)}
|
93
|
-
end
|
150
|
+
learn_params = { "predfile" => {}}
|
94
151
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
def test_read
|
100
|
-
assert m = Model.read_from_file('test/assets/model')
|
101
|
-
assert_equal 3877, m.support_vectors_count
|
102
|
-
assert_equal 39118, m.total_words
|
103
|
-
end
|
152
|
+
assert_raise(ArgumentError) do
|
153
|
+
Model.new(:classification, @docs_and_labels, learn_params, {}, [1, 0.0, 1])
|
154
|
+
end
|
155
|
+
end
|
104
156
|
|
105
|
-
def test_classify
|
106
|
-
m = Model.read_from_file('test/assets/model')
|
107
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1 ,v.to_f]} ) )
|
108
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1.0, 0, 0, 0, 0.5 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
109
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0, 0, 0, 0.8, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
110
|
-
assert_kind_of Numeric, m.classify( Document.create(-1, 1, 0, 0,[1, 0.5, 0, 0, 0, 0 , 0 ].each_with_index.map{|v, i| [i + 1,v.to_f]}) )
|
111
157
|
end
|
112
|
-
|
113
158
|
end
|
114
159
|
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: svmredlight
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.1.
|
5
|
+
version: 0.1.1
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Camilo Lopez
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-09-
|
13
|
+
date: 2011-09-22 00:00:00 -04:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
113
113
|
requirements:
|
114
114
|
- - ">="
|
115
115
|
- !ruby/object:Gem::Version
|
116
|
-
hash:
|
116
|
+
hash: 2966611142819785204
|
117
117
|
segments:
|
118
118
|
- 0
|
119
119
|
version: "0"
|