eluka 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. data/.document +5 -0
  2. data/DOCUMENTATION_STANDARDS +39 -0
  3. data/Gemfile +13 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +19 -0
  7. data/Rakefile +69 -0
  8. data/VERSION +1 -0
  9. data/examples/example.rb +59 -0
  10. data/ext/libsvm/COPYRIGHT +31 -0
  11. data/ext/libsvm/FAQ.html +1749 -0
  12. data/ext/libsvm/Makefile +25 -0
  13. data/ext/libsvm/Makefile.win +33 -0
  14. data/ext/libsvm/README +733 -0
  15. data/ext/libsvm/extconf.rb +1 -0
  16. data/ext/libsvm/heart_scale +270 -0
  17. data/ext/libsvm/java/Makefile +25 -0
  18. data/ext/libsvm/java/libsvm.jar +0 -0
  19. data/ext/libsvm/java/libsvm/svm.java +2776 -0
  20. data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
  21. data/ext/libsvm/java/libsvm/svm_model.java +21 -0
  22. data/ext/libsvm/java/libsvm/svm_node.java +6 -0
  23. data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
  24. data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
  25. data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
  26. data/ext/libsvm/java/svm_predict.java +163 -0
  27. data/ext/libsvm/java/svm_scale.java +350 -0
  28. data/ext/libsvm/java/svm_toy.java +471 -0
  29. data/ext/libsvm/java/svm_train.java +318 -0
  30. data/ext/libsvm/java/test_applet.html +1 -0
  31. data/ext/libsvm/python/Makefile +4 -0
  32. data/ext/libsvm/python/README +331 -0
  33. data/ext/libsvm/python/svm.py +259 -0
  34. data/ext/libsvm/python/svmutil.py +242 -0
  35. data/ext/libsvm/svm-predict.c +226 -0
  36. data/ext/libsvm/svm-scale.c +353 -0
  37. data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
  38. data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
  39. data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
  40. data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
  41. data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
  42. data/ext/libsvm/svm-toy/gtk/main.c +23 -0
  43. data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
  44. data/ext/libsvm/svm-toy/qt/Makefile +17 -0
  45. data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
  46. data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
  47. data/ext/libsvm/svm-train.c +376 -0
  48. data/ext/libsvm/svm.cpp +3060 -0
  49. data/ext/libsvm/svm.def +19 -0
  50. data/ext/libsvm/svm.h +105 -0
  51. data/ext/libsvm/svm.o +0 -0
  52. data/ext/libsvm/tools/README +149 -0
  53. data/ext/libsvm/tools/checkdata.py +108 -0
  54. data/ext/libsvm/tools/easy.py +79 -0
  55. data/ext/libsvm/tools/grid.py +359 -0
  56. data/ext/libsvm/tools/subset.py +146 -0
  57. data/ext/libsvm/windows/libsvm.dll +0 -0
  58. data/ext/libsvm/windows/svm-predict.exe +0 -0
  59. data/ext/libsvm/windows/svm-scale.exe +0 -0
  60. data/ext/libsvm/windows/svm-toy.exe +0 -0
  61. data/ext/libsvm/windows/svm-train.exe +0 -0
  62. data/lib/eluka.rb +10 -0
  63. data/lib/eluka/bijection.rb +23 -0
  64. data/lib/eluka/data_point.rb +36 -0
  65. data/lib/eluka/document.rb +47 -0
  66. data/lib/eluka/feature_vector.rb +86 -0
  67. data/lib/eluka/features.rb +31 -0
  68. data/lib/eluka/model.rb +129 -0
  69. data/lib/fselect.rb +321 -0
  70. data/lib/grid.rb +25 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_eluka.rb +7 -0
  73. metadata +214 -0
@@ -0,0 +1,226 @@
1
+ #include <stdio.h>
2
+ #include <ctype.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+ #include <errno.h>
6
+ #include "svm.h"
7
+
8
/* Feature-vector buffer for the record being predicted; allocated in main()
 * and grown inside predict(). max_nr_attr is its capacity in nodes. */
struct svm_node *x;
int max_nr_attr = 64;

/* Model loaded in main() from the model_file argument. */
struct svm_model* model;

/* Value of the -b option; non-zero requests probability estimates. */
int predict_probability=0;

/* Line buffer shared by readline() and predict(), with its current capacity. */
static char *line = NULL;
static int max_line_len;
16
+
17
+ static char* readline(FILE *input)
18
+ {
19
+ int len;
20
+
21
+ if(fgets(line,max_line_len,input) == NULL)
22
+ return NULL;
23
+
24
+ while(strrchr(line,'\n') == NULL)
25
+ {
26
+ max_line_len *= 2;
27
+ line = (char *) realloc(line,max_line_len);
28
+ len = (int) strlen(line);
29
+ if(fgets(line+len,max_line_len-len,input) == NULL)
30
+ break;
31
+ }
32
+ return line;
33
+ }
34
+
35
/*
 * Report a malformed input record and terminate the program.
 *
 * line_num: number printed in the diagnostic as the offending line.
 * Does not return; the process exits with status 1.
 */
void exit_input_error(int line_num)
{
	fprintf(stderr,
	        "Wrong input format at line %d\n",
	        line_num);

	exit(1);
}
40
+
41
+ void predict(FILE *input, FILE *output)
42
+ {
43
+ int correct = 0;
44
+ int total = 0;
45
+ double error = 0;
46
+ double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;
47
+
48
+ int svm_type=svm_get_svm_type(model);
49
+ int nr_class=svm_get_nr_class(model);
50
+ double *prob_estimates=NULL;
51
+ int j;
52
+
53
+ if(predict_probability)
54
+ {
55
+ if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
56
+ printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model));
57
+ else
58
+ {
59
+ int *labels=(int *) malloc(nr_class*sizeof(int));
60
+ svm_get_labels(model,labels);
61
+ prob_estimates = (double *) malloc(nr_class*sizeof(double));
62
+ fprintf(output,"labels");
63
+ for(j=0;j<nr_class;j++)
64
+ fprintf(output," %d",labels[j]);
65
+ fprintf(output,"\n");
66
+ free(labels);
67
+ }
68
+ }
69
+
70
+ max_line_len = 1024;
71
+ line = (char *)malloc(max_line_len*sizeof(char));
72
+ while(readline(input) != NULL)
73
+ {
74
+ int i = 0;
75
+ double target_label, predict_label;
76
+ char *idx, *val, *label, *endptr;
77
+ int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
78
+
79
+ label = strtok(line," \t");
80
+ target_label = strtod(label,&endptr);
81
+ if(endptr == label)
82
+ exit_input_error(total+1);
83
+
84
+ while(1)
85
+ {
86
+ if(i>=max_nr_attr-1) // need one more for index = -1
87
+ {
88
+ max_nr_attr *= 2;
89
+ x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
90
+ }
91
+
92
+ idx = strtok(NULL,":");
93
+ val = strtok(NULL," \t");
94
+
95
+ if(val == NULL)
96
+ break;
97
+ errno = 0;
98
+ x[i].index = (int) strtol(idx,&endptr,10);
99
+ if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
100
+ exit_input_error(total+1);
101
+ else
102
+ inst_max_index = x[i].index;
103
+
104
+ errno = 0;
105
+ x[i].value = strtod(val,&endptr);
106
+ if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
107
+ exit_input_error(total+1);
108
+
109
+ ++i;
110
+ }
111
+ x[i].index = -1;
112
+
113
+ if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC))
114
+ {
115
+ predict_label = svm_predict_probability(model,x,prob_estimates);
116
+ fprintf(output,"%g",predict_label);
117
+ for(j=0;j<nr_class;j++)
118
+ fprintf(output," %g",prob_estimates[j]);
119
+ fprintf(output,"\n");
120
+ }
121
+ else
122
+ {
123
+ predict_label = svm_predict(model,x);
124
+ fprintf(output,"%g\n",predict_label);
125
+ }
126
+
127
+ if(predict_label == target_label)
128
+ ++correct;
129
+ error += (predict_label-target_label)*(predict_label-target_label);
130
+ sump += predict_label;
131
+ sumt += target_label;
132
+ sumpp += predict_label*predict_label;
133
+ sumtt += target_label*target_label;
134
+ sumpt += predict_label*target_label;
135
+ ++total;
136
+ }
137
+ if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
138
+ {
139
+ printf("Mean squared error = %g (regression)\n",error/total);
140
+ printf("Squared correlation coefficient = %g (regression)\n",
141
+ ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
142
+ ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
143
+ );
144
+ }
145
+ else
146
+ printf("Accuracy = %g%% (%d/%d) (classification)\n",
147
+ (double)correct/total*100,correct,total);
148
+ if(predict_probability)
149
+ free(prob_estimates);
150
+ }
151
+
152
/*
 * Print the svm-predict usage text to stdout and terminate.
 * Does not return; the process exits with status 1.
 */
void exit_with_help()
{
	/* the text contains no conversions, so it is written verbatim */
	fputs(
	"Usage: svm-predict [options] test_file model_file output_file\n"
	"options:\n"
	"-b probability_estimates: whether to predict probability estimates, 0 or 1 (default 0); for one-class SVM only 0 is supported\n",
	stdout);

	exit(1);
}
161
+
162
+ int main(int argc, char **argv)
163
+ {
164
+ FILE *input, *output;
165
+ int i;
166
+
167
+ // parse options
168
+ for(i=1;i<argc;i++)
169
+ {
170
+ if(argv[i][0] != '-') break;
171
+ ++i;
172
+ switch(argv[i-1][1])
173
+ {
174
+ case 'b':
175
+ predict_probability = atoi(argv[i]);
176
+ break;
177
+ default:
178
+ fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]);
179
+ exit_with_help();
180
+ }
181
+ }
182
+ if(i>=argc-2)
183
+ exit_with_help();
184
+
185
+ input = fopen(argv[i],"r");
186
+ if(input == NULL)
187
+ {
188
+ fprintf(stderr,"can't open input file %s\n",argv[i]);
189
+ exit(1);
190
+ }
191
+
192
+ output = fopen(argv[i+2],"w");
193
+ if(output == NULL)
194
+ {
195
+ fprintf(stderr,"can't open output file %s\n",argv[i+2]);
196
+ exit(1);
197
+ }
198
+
199
+ if((model=svm_load_model(argv[i+1]))==0)
200
+ {
201
+ fprintf(stderr,"can't open model file %s\n",argv[i+1]);
202
+ exit(1);
203
+ }
204
+
205
+ x = (struct svm_node *) malloc(max_nr_attr*sizeof(struct svm_node));
206
+ if(predict_probability)
207
+ {
208
+ if(svm_check_probability_model(model)==0)
209
+ {
210
+ fprintf(stderr,"Model does not support probabiliy estimates\n");
211
+ exit(1);
212
+ }
213
+ }
214
+ else
215
+ {
216
+ if(svm_check_probability_model(model)!=0)
217
+ printf("Model supports probability estimates, but disabled in prediction.\n");
218
+ }
219
+ predict(input,output);
220
+ svm_free_and_destroy_model(&model);
221
+ free(x);
222
+ free(line);
223
+ fclose(input);
224
+ fclose(output);
225
+ return 0;
226
+ }
@@ -0,0 +1,353 @@
1
+ #include <float.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <ctype.h>
5
+ #include <string.h>
6
+
7
/*
 * Print the svm-scale usage text to stdout and terminate.
 * Does not return; the process exits with status 1.
 */
void exit_with_help()
{
	/* the text contains no conversions, so it is written verbatim */
	fputs(
	"Usage: svm-scale [options] data_filename\n"
	"options:\n"
	"-l lower : x scaling lower limit (default -1)\n"
	"-u upper : x scaling upper limit (default +1)\n"
	"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
	"-s save_filename : save scaling parameters to save_filename\n"
	"-r restore_filename : restore scaling parameters from restore_filename\n",
	stdout);

	exit(1);
}
20
+
21
/* Line buffer filled by readline(), with its current capacity in bytes. */
char *line = NULL;
int max_line_len = 1024;

/* Output range for feature values (-l / -u). */
double lower = -1.0;
double upper = 1.0;

/* Output range for target values (-y); used only when y_scaling is set. */
double y_lower;
double y_upper;
int y_scaling = 0;

/* Per-index extremes of the feature values, allocated in main(). */
double *feature_max;
double *feature_min;

/* Extremes of the target values; start at the opposite ends of the range. */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest feature index found in the data (and restore file, if any). */
int max_index;

/* Count of index:value pairs read, and of non-zero values written. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* Both macros evaluate their arguments more than once. */
#define max(x,y) (((x)>(y))?(x):(y))
#define min(x,y) (((x)<(y))?(x):(y))

void output_target(double value);
void output(int index, double value);
char* readline(FILE *input);
39
+
40
+ int main(int argc,char **argv)
41
+ {
42
+ int i,index;
43
+ FILE *fp, *fp_restore = NULL;
44
+ char *save_filename = NULL;
45
+ char *restore_filename = NULL;
46
+
47
+ for(i=1;i<argc;i++)
48
+ {
49
+ if(argv[i][0] != '-') break;
50
+ ++i;
51
+ switch(argv[i-1][1])
52
+ {
53
+ case 'l': lower = atof(argv[i]); break;
54
+ case 'u': upper = atof(argv[i]); break;
55
+ case 'y':
56
+ y_lower = atof(argv[i]);
57
+ ++i;
58
+ y_upper = atof(argv[i]);
59
+ y_scaling = 1;
60
+ break;
61
+ case 's': save_filename = argv[i]; break;
62
+ case 'r': restore_filename = argv[i]; break;
63
+ default:
64
+ fprintf(stderr,"unknown option\n");
65
+ exit_with_help();
66
+ }
67
+ }
68
+
69
+ if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
70
+ {
71
+ fprintf(stderr,"inconsistent lower/upper specification\n");
72
+ exit(1);
73
+ }
74
+
75
+ if(restore_filename && save_filename)
76
+ {
77
+ fprintf(stderr,"cannot use -r and -s simultaneously\n");
78
+ exit(1);
79
+ }
80
+
81
+ if(argc != i+1)
82
+ exit_with_help();
83
+
84
+ fp=fopen(argv[i],"r");
85
+
86
+ if(fp==NULL)
87
+ {
88
+ fprintf(stderr,"can't open file %s\n", argv[i]);
89
+ exit(1);
90
+ }
91
+
92
+ line = (char *) malloc(max_line_len*sizeof(char));
93
+
94
+ #define SKIP_TARGET\
95
+ while(isspace(*p)) ++p;\
96
+ while(!isspace(*p)) ++p;
97
+
98
+ #define SKIP_ELEMENT\
99
+ while(*p!=':') ++p;\
100
+ ++p;\
101
+ while(isspace(*p)) ++p;\
102
+ while(*p && !isspace(*p)) ++p;
103
+
104
+ /* assumption: min index of attributes is 1 */
105
+ /* pass 1: find out max index of attributes */
106
+ max_index = 0;
107
+
108
+ if(restore_filename)
109
+ {
110
+ int idx, c;
111
+
112
+ fp_restore = fopen(restore_filename,"r");
113
+ if(fp_restore==NULL)
114
+ {
115
+ fprintf(stderr,"can't open file %s\n", restore_filename);
116
+ exit(1);
117
+ }
118
+
119
+ c = fgetc(fp_restore);
120
+ if(c == 'y')
121
+ {
122
+ readline(fp_restore);
123
+ readline(fp_restore);
124
+ readline(fp_restore);
125
+ }
126
+ readline(fp_restore);
127
+ readline(fp_restore);
128
+
129
+ while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
130
+ max_index = max(idx,max_index);
131
+ rewind(fp_restore);
132
+ }
133
+
134
+ while(readline(fp)!=NULL)
135
+ {
136
+ char *p=line;
137
+
138
+ SKIP_TARGET
139
+
140
+ while(sscanf(p,"%d:%*f",&index)==1)
141
+ {
142
+ max_index = max(max_index, index);
143
+ SKIP_ELEMENT
144
+ num_nonzeros++;
145
+ }
146
+ }
147
+ rewind(fp);
148
+
149
+ feature_max = (double *)malloc((max_index+1)* sizeof(double));
150
+ feature_min = (double *)malloc((max_index+1)* sizeof(double));
151
+
152
+ if(feature_max == NULL || feature_min == NULL)
153
+ {
154
+ fprintf(stderr,"can't allocate enough memory\n");
155
+ exit(1);
156
+ }
157
+
158
+ for(i=0;i<=max_index;i++)
159
+ {
160
+ feature_max[i]=-DBL_MAX;
161
+ feature_min[i]=DBL_MAX;
162
+ }
163
+
164
+ /* pass 2: find out min/max value */
165
+ while(readline(fp)!=NULL)
166
+ {
167
+ char *p=line;
168
+ int next_index=1;
169
+ double target;
170
+ double value;
171
+
172
+ sscanf(p,"%lf",&target);
173
+ y_max = max(y_max,target);
174
+ y_min = min(y_min,target);
175
+
176
+ SKIP_TARGET
177
+
178
+ while(sscanf(p,"%d:%lf",&index,&value)==2)
179
+ {
180
+ for(i=next_index;i<index;i++)
181
+ {
182
+ feature_max[i]=max(feature_max[i],0);
183
+ feature_min[i]=min(feature_min[i],0);
184
+ }
185
+
186
+ feature_max[index]=max(feature_max[index],value);
187
+ feature_min[index]=min(feature_min[index],value);
188
+
189
+ SKIP_ELEMENT
190
+ next_index=index+1;
191
+ }
192
+
193
+ for(i=next_index;i<=max_index;i++)
194
+ {
195
+ feature_max[i]=max(feature_max[i],0);
196
+ feature_min[i]=min(feature_min[i],0);
197
+ }
198
+ }
199
+
200
+ rewind(fp);
201
+
202
+ /* pass 2.5: save/restore feature_min/feature_max */
203
+
204
+ if(restore_filename)
205
+ {
206
+ /* fp_restore rewinded in finding max_index */
207
+ int idx, c;
208
+ double fmin, fmax;
209
+
210
+ if((c = fgetc(fp_restore)) == 'y')
211
+ {
212
+ fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper);
213
+ fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max);
214
+ y_scaling = 1;
215
+ }
216
+ else
217
+ ungetc(c, fp_restore);
218
+
219
+ if (fgetc(fp_restore) == 'x') {
220
+ fscanf(fp_restore, "%lf %lf\n", &lower, &upper);
221
+ while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
222
+ {
223
+ if(idx<=max_index)
224
+ {
225
+ feature_min[idx] = fmin;
226
+ feature_max[idx] = fmax;
227
+ }
228
+ }
229
+ }
230
+ fclose(fp_restore);
231
+ }
232
+
233
+ if(save_filename)
234
+ {
235
+ FILE *fp_save = fopen(save_filename,"w");
236
+ if(fp_save==NULL)
237
+ {
238
+ fprintf(stderr,"can't open file %s\n", save_filename);
239
+ exit(1);
240
+ }
241
+ if(y_scaling)
242
+ {
243
+ fprintf(fp_save, "y\n");
244
+ fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
245
+ fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
246
+ }
247
+ fprintf(fp_save, "x\n");
248
+ fprintf(fp_save, "%.16g %.16g\n", lower, upper);
249
+ for(i=1;i<=max_index;i++)
250
+ {
251
+ if(feature_min[i]!=feature_max[i])
252
+ fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
253
+ }
254
+ fclose(fp_save);
255
+ }
256
+
257
+ /* pass 3: scale */
258
+ while(readline(fp)!=NULL)
259
+ {
260
+ char *p=line;
261
+ int next_index=1;
262
+ double target;
263
+ double value;
264
+
265
+ sscanf(p,"%lf",&target);
266
+ output_target(target);
267
+
268
+ SKIP_TARGET
269
+
270
+ while(sscanf(p,"%d:%lf",&index,&value)==2)
271
+ {
272
+ for(i=next_index;i<index;i++)
273
+ output(i,0);
274
+
275
+ output(index,value);
276
+
277
+ SKIP_ELEMENT
278
+ next_index=index+1;
279
+ }
280
+
281
+ for(i=next_index;i<=max_index;i++)
282
+ output(i,0);
283
+
284
+ printf("\n");
285
+ }
286
+
287
+ if (new_num_nonzeros > num_nonzeros)
288
+ fprintf(stderr,
289
+ "Warning: original #nonzeros %ld\n"
290
+ " new #nonzeros %ld\n"
291
+ "Use -l 0 if many original feature values are zeros\n",
292
+ num_nonzeros, new_num_nonzeros);
293
+
294
+ free(line);
295
+ free(feature_max);
296
+ free(feature_min);
297
+ fclose(fp);
298
+ return 0;
299
+ }
300
+
301
+ char* readline(FILE *input)
302
+ {
303
+ int len;
304
+
305
+ if(fgets(line,max_line_len,input) == NULL)
306
+ return NULL;
307
+
308
+ while(strrchr(line,'\n') == NULL)
309
+ {
310
+ max_line_len *= 2;
311
+ line = (char *) realloc(line, max_line_len);
312
+ len = (int) strlen(line);
313
+ if(fgets(line+len,max_line_len-len,input) == NULL)
314
+ break;
315
+ }
316
+ return line;
317
+ }
318
+
319
+ void output_target(double value)
320
+ {
321
+ if(y_scaling)
322
+ {
323
+ if(value == y_min)
324
+ value = y_lower;
325
+ else if(value == y_max)
326
+ value = y_upper;
327
+ else value = y_lower + (y_upper-y_lower) *
328
+ (value - y_min)/(y_max-y_min);
329
+ }
330
+ printf("%g ",value);
331
+ }
332
+
333
+ void output(int index, double value)
334
+ {
335
+ /* skip single-valued attribute */
336
+ if(feature_max[index] == feature_min[index])
337
+ return;
338
+
339
+ if(value == feature_min[index])
340
+ value = lower;
341
+ else if(value == feature_max[index])
342
+ value = upper;
343
+ else
344
+ value = lower + (upper-lower) *
345
+ (value-feature_min[index])/
346
+ (feature_max[index]-feature_min[index]);
347
+
348
+ if(value != 0)
349
+ {
350
+ printf("%d:%g ",index, value);
351
+ new_num_nonzeros++;
352
+ }
353
+ }