eluka 0.1.0

Files changed (73)
  1. data/.document +5 -0
  2. data/DOCUMENTATION_STANDARDS +39 -0
  3. data/Gemfile +13 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +19 -0
  7. data/Rakefile +69 -0
  8. data/VERSION +1 -0
  9. data/examples/example.rb +59 -0
  10. data/ext/libsvm/COPYRIGHT +31 -0
  11. data/ext/libsvm/FAQ.html +1749 -0
  12. data/ext/libsvm/Makefile +25 -0
  13. data/ext/libsvm/Makefile.win +33 -0
  14. data/ext/libsvm/README +733 -0
  15. data/ext/libsvm/extconf.rb +1 -0
  16. data/ext/libsvm/heart_scale +270 -0
  17. data/ext/libsvm/java/Makefile +25 -0
  18. data/ext/libsvm/java/libsvm.jar +0 -0
  19. data/ext/libsvm/java/libsvm/svm.java +2776 -0
  20. data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
  21. data/ext/libsvm/java/libsvm/svm_model.java +21 -0
  22. data/ext/libsvm/java/libsvm/svm_node.java +6 -0
  23. data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
  24. data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
  25. data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
  26. data/ext/libsvm/java/svm_predict.java +163 -0
  27. data/ext/libsvm/java/svm_scale.java +350 -0
  28. data/ext/libsvm/java/svm_toy.java +471 -0
  29. data/ext/libsvm/java/svm_train.java +318 -0
  30. data/ext/libsvm/java/test_applet.html +1 -0
  31. data/ext/libsvm/python/Makefile +4 -0
  32. data/ext/libsvm/python/README +331 -0
  33. data/ext/libsvm/python/svm.py +259 -0
  34. data/ext/libsvm/python/svmutil.py +242 -0
  35. data/ext/libsvm/svm-predict.c +226 -0
  36. data/ext/libsvm/svm-scale.c +353 -0
  37. data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
  38. data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
  39. data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
  40. data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
  41. data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
  42. data/ext/libsvm/svm-toy/gtk/main.c +23 -0
  43. data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
  44. data/ext/libsvm/svm-toy/qt/Makefile +17 -0
  45. data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
  46. data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
  47. data/ext/libsvm/svm-train.c +376 -0
  48. data/ext/libsvm/svm.cpp +3060 -0
  49. data/ext/libsvm/svm.def +19 -0
  50. data/ext/libsvm/svm.h +105 -0
  51. data/ext/libsvm/svm.o +0 -0
  52. data/ext/libsvm/tools/README +149 -0
  53. data/ext/libsvm/tools/checkdata.py +108 -0
  54. data/ext/libsvm/tools/easy.py +79 -0
  55. data/ext/libsvm/tools/grid.py +359 -0
  56. data/ext/libsvm/tools/subset.py +146 -0
  57. data/ext/libsvm/windows/libsvm.dll +0 -0
  58. data/ext/libsvm/windows/svm-predict.exe +0 -0
  59. data/ext/libsvm/windows/svm-scale.exe +0 -0
  60. data/ext/libsvm/windows/svm-toy.exe +0 -0
  61. data/ext/libsvm/windows/svm-train.exe +0 -0
  62. data/lib/eluka.rb +10 -0
  63. data/lib/eluka/bijection.rb +23 -0
  64. data/lib/eluka/data_point.rb +36 -0
  65. data/lib/eluka/document.rb +47 -0
  66. data/lib/eluka/feature_vector.rb +86 -0
  67. data/lib/eluka/features.rb +31 -0
  68. data/lib/eluka/model.rb +129 -0
  69. data/lib/fselect.rb +321 -0
  70. data/lib/grid.rb +25 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_eluka.rb +7 -0
  73. metadata +214 -0
data/ext/libsvm/svm.def ADDED
@@ -0,0 +1,19 @@
+ LIBRARY libsvm
+ EXPORTS
+     svm_train @1
+     svm_cross_validation @2
+     svm_save_model @3
+     svm_load_model @4
+     svm_get_svm_type @5
+     svm_get_nr_class @6
+     svm_get_labels @7
+     svm_get_svr_probability @8
+     svm_predict_values @9
+     svm_predict @10
+     svm_predict_probability @11
+     svm_free_model_content @12
+     svm_free_and_destroy_model @13
+     svm_destroy_param @14
+     svm_check_parameter @15
+     svm_check_probability_model @16
+     svm_set_print_string_function @17
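
The list above fixes the symbol names (and ordinals) exported by the prebuilt windows/libsvm.dll that ships further down in this change set. As a purely illustrative sketch (error handling trimmed; the DLL path is just an example), one of those exports can be resolved at run time on Windows like this:

#include <stdio.h>
#include <windows.h>

struct svm_model;                                  /* opaque here; the full definition is in svm.h below */
typedef int (*svm_get_svm_type_t)(const struct svm_model *);

int main(void)
{
    HMODULE dll = LoadLibraryA("libsvm.dll");      /* example path only */
    if (!dll) { fprintf(stderr, "cannot load libsvm.dll\n"); return 1; }

    /* "svm_get_svm_type" is export @5 in the table above */
    svm_get_svm_type_t get_type =
        (svm_get_svm_type_t)GetProcAddress(dll, "svm_get_svm_type");
    if (!get_type) { fprintf(stderr, "symbol not found\n"); return 1; }

    /* a real caller would now obtain a model via svm_load_model (@4) and pass it to get_type */
    FreeLibrary(dll);
    return 0;
}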
data/ext/libsvm/svm.h ADDED
@@ -0,0 +1,105 @@
+ #ifndef _LIBSVM_H
+ #define _LIBSVM_H
+
+ #define LIBSVM_VERSION 300
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ extern int libsvm_version;
+
+ struct svm_node
+ {
+     int index;
+     double value;
+ };
+
+ struct svm_problem
+ {
+     int l;
+     double *y;
+     struct svm_node **x;
+ };
+
+ enum { C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR };  /* svm_type */
+ enum { LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED };        /* kernel_type */
+
+ struct svm_parameter
+ {
+     int svm_type;
+     int kernel_type;
+     int degree;          /* for poly */
+     double gamma;        /* for poly/rbf/sigmoid */
+     double coef0;        /* for poly/sigmoid */
+
+     /* these are for training only */
+     double cache_size;   /* in MB */
+     double eps;          /* stopping criteria */
+     double C;            /* for C_SVC, EPSILON_SVR and NU_SVR */
+     int nr_weight;       /* for C_SVC */
+     int *weight_label;   /* for C_SVC */
+     double* weight;      /* for C_SVC */
+     double nu;           /* for NU_SVC, ONE_CLASS, and NU_SVR */
+     double p;            /* for EPSILON_SVR */
+     int shrinking;       /* use the shrinking heuristics */
+     int probability;     /* do probability estimates */
+ };
+
+ //
+ // svm_model
+ //
+ struct svm_model
+ {
+     struct svm_parameter param;  /* parameter */
+     int nr_class;                /* number of classes, = 2 in regression/one class svm */
+     int l;                       /* total #SV */
+     struct svm_node **SV;        /* SVs (SV[l]) */
+     double **sv_coef;            /* coefficients for SVs in decision functions (sv_coef[k-1][l]) */
+     double *rho;                 /* constants in decision functions (rho[k*(k-1)/2]) */
+     double *probA;               /* pairwise probability information */
+     double *probB;
+
+     /* for classification only */
+
+     int *label;                  /* label of each class (label[k]) */
+     int *nSV;                    /* number of SVs for each class (nSV[k]) */
+                                  /* nSV[0] + nSV[1] + ... + nSV[k-1] = l */
+     /* XXX */
+     int free_sv;                 /* 1 if svm_model is created by svm_load_model */
+                                  /* 0 if svm_model is created by svm_train */
+ };
+
+ struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param);
+ void svm_cross_validation(const struct svm_problem *prob, const struct svm_parameter *param, int nr_fold, double *target);
+
+ int svm_save_model(const char *model_file_name, const struct svm_model *model);
+ struct svm_model *svm_load_model(const char *model_file_name);
+
+ int svm_get_svm_type(const struct svm_model *model);
+ int svm_get_nr_class(const struct svm_model *model);
+ void svm_get_labels(const struct svm_model *model, int *label);
+ double svm_get_svr_probability(const struct svm_model *model);
+
+ double svm_predict_values(const struct svm_model *model, const struct svm_node *x, double* dec_values);
+ double svm_predict(const struct svm_model *model, const struct svm_node *x);
+ double svm_predict_probability(const struct svm_model *model, const struct svm_node *x, double* prob_estimates);
+
+ void svm_free_model_content(struct svm_model *model_ptr);
+ void svm_free_and_destroy_model(struct svm_model **model_ptr_ptr);
+ void svm_destroy_param(struct svm_parameter *param);
+
+ const char *svm_check_parameter(const struct svm_problem *prob, const struct svm_parameter *param);
+ int svm_check_probability_model(const struct svm_model *model);
+
+ void svm_set_print_string_function(void (*print_func)(const char *));
+
+ // deprecated
+ // this function will be removed in future release
+ void svm_destroy_model(struct svm_model *model_ptr);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif /* _LIBSVM_H */
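
The header above is the entire C interface that the rest of this gem wraps. As a minimal usage sketch only (the two hand-built training points, all parameter values, and the control flow are illustrative, not code from the gem), a caller builds an svm_problem, fills an svm_parameter, trains, and predicts like this:

#include <stdio.h>
#include "svm.h"

int main(void)
{
    /* two one-feature training points: x=0 labelled -1, x=1 labelled +1;
       an index of -1 terminates each sparse vector */
    struct svm_node x0[] = { {1, 0.0}, {-1, 0.0} };
    struct svm_node x1[] = { {1, 1.0}, {-1, 0.0} };
    struct svm_node *xs[] = { x0, x1 };
    double ys[] = { -1, +1 };
    struct svm_problem prob = { 2, ys, xs };        /* l, y, x */

    struct svm_parameter param;
    param.svm_type = C_SVC;
    param.kernel_type = RBF;
    param.degree = 3;
    param.gamma = 1.0;            /* arbitrary for this sketch */
    param.coef0 = 0;
    param.cache_size = 100;       /* MB */
    param.eps = 1e-3;
    param.C = 1;
    param.nr_weight = 0;
    param.weight_label = NULL;
    param.weight = NULL;
    param.nu = 0.5;
    param.p = 0.1;
    param.shrinking = 1;
    param.probability = 0;

    const char *msg = svm_check_parameter(&prob, &param);
    if (msg) { fprintf(stderr, "parameter error: %s\n", msg); return 1; }

    struct svm_model *model = svm_train(&prob, &param);
    printf("prediction for x=1: %f\n", svm_predict(model, x1));

    svm_free_and_destroy_model(&model);
    svm_destroy_param(&param);
    return 0;
}

Note that svm_train keeps pointers into prob, so in real code prob and its svm_node arrays must stay alive for the lifetime of the model.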
data/ext/libsvm/svm.o ADDED
Binary file
data/ext/libsvm/tools/README ADDED
@@ -0,0 +1,149 @@
+ This directory includes some useful tools:
+
+ 1. subset selection tools
+ 2. parameter selection tools
+ 3. LIBSVM format checking tools
+
+ Part I: Subset selection tools
+
+ Introduction
+ ============
+
+ Training on large data is time consuming. Sometimes one should work on a
+ smaller subset first. The Python script subset.py randomly selects a
+ specified number of samples. For classification data, we provide
+ stratified selection to ensure the same class distribution in the
+ subset.
+
+ Usage: subset.py [options] dataset number [output1] [output2]
+
+ This script selects a subset of the given data set.
+
+ options:
+ -s method : method of selection (default 0)
+      0 -- stratified selection (classification only)
+      1 -- random selection
+
+ output1 : the subset (optional)
+ output2 : the rest of the data (optional)
+
+ If output1 is omitted, the subset will be printed on the screen.
+
+ Example
+ =======
+
+ > python subset.py heart_scale 100 file1 file2
+
+ From heart_scale, 100 samples are randomly selected and stored in
+ file1; all remaining instances are stored in file2.
+
+
+ Part II: Parameter Selection Tools
+
+ Introduction
+ ============
+
+ grid.py is a parameter selection tool for C-SVM classification using
+ the RBF (radial basis function) kernel. It uses the cross validation (CV)
+ technique to estimate the accuracy of each parameter combination in
+ the specified range and helps you decide the best parameters for
+ your problem.
+
+ grid.py directly executes the libsvm binaries (so no Python binding is needed)
+ for cross validation and then draws a contour of CV accuracy using gnuplot.
+ You must have libsvm and gnuplot installed before using it. The
+ gnuplot package is available at http://www.gnuplot.info/
+
+ On Mac OS X, the precompiled gnuplot binary needs the AquaTerm library,
+ which must therefore be installed as well. In addition, this version of
+ gnuplot does not support png, so you need to change "set term png
+ transparent small" to another image format. For example, you may
+ use "set term pbm small color".
+
+ Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-v fold]
+        [-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
+        [additional parameters for svm-train] dataset
+
+ The program conducts v-fold cross validation using parameter C (and gamma)
+ = 2^begin, 2^(begin+step), ..., 2^end.
+
+ You can specify where the libsvm executable and gnuplot are using the
+ -svmtrain and -gnuplot parameters.
+
+ For Windows users, please use pgnuplot.exe. If you are using gnuplot
+ 3.7.1, please upgrade to version 3.7.3 or higher, since 3.7.1 has a
+ bug. If you use Cygwin on Windows, please use gnuplot-x11.
+
+ Example
+ =======
+
+ > python grid.py -log2c -5,5,1 -log2g -4,0,1 -v 5 -m 300 heart_scale
+
+ Users (in particular MS Windows users) may need to specify the paths of
+ the executable files. You can either change the paths at the beginning of
+ grid.py or specify them on the command line. For example,
+
+ > grid.py -log2c -5,5,1 -svmtrain c:\libsvm\windows\svm-train.exe -gnuplot c:\tmp\gnuplot\bin\pgnuplot.exe -v 10 heart_scale
+
+ Output: two files
+ dataset.png: the CV accuracy contour plot generated by gnuplot
+ dataset.out: the CV accuracy at each (log2(C), log2(gamma))
+
+ Parallel grid search
+ ====================
+
+ You can conduct a parallel grid search by dispatching jobs to a
+ cluster of computers that share the same file system. First, add
+ the machine names in grid.py:
+
+ ssh_workers = ["linux1", "linux5", "linux5"]
+
+ and then set up your ssh so that authentication works without
+ asking for a password.
+
+ The same machine (e.g., linux5 here) can be listed more than once if
+ it has multiple CPUs or more RAM. If the local machine is the
+ best choice, you can also increase nr_local_worker. For example:
+
+ nr_local_worker = 2
+
+ Example:
+
+ > python grid.py heart_scale
+ [local] -1 -1 78.8889 (best c=0.5, g=0.5, rate=78.8889)
+ [linux5] -1 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
+ [linux5] 5 -1 77.037 (best c=0.5, g=0.0078125, rate=83.3333)
+ [linux1] 5 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
+ .
+ .
+ .
+
+ If -log2c, -log2g, or -v is not specified, default values are used.
+
+ If your system uses telnet instead of ssh, list the computer names
+ in telnet_workers.
+
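
To make the sweep concrete, here is a rough sketch of the same grid written directly against the C API declared in svm.h earlier in this change set, rather than by invoking the svm-train binary the way grid.py actually does. It assumes an svm_problem `prob' and an svm_parameter `param' that have already been filled in, and hard-codes the -log2c -5,5,1 and -log2g -4,0,1 ranges from the example above:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "svm.h"

/* v-fold cross validation over C = 2^-5..2^5 and gamma = 2^-4..2^0 */
void grid_search(const struct svm_problem *prob, struct svm_parameter *param, int nr_fold)
{
    double best_rate = 0, best_c = 0, best_g = 0;
    double *target = malloc(prob->l * sizeof(double));   /* CV-predicted label per instance */

    for (double log2c = -5; log2c <= 5; log2c += 1)
        for (double log2g = -4; log2g <= 0; log2g += 1) {
            param->C = pow(2, log2c);
            param->gamma = pow(2, log2g);
            svm_cross_validation(prob, param, nr_fold, target);

            int correct = 0;
            for (int i = 0; i < prob->l; i++)
                if (target[i] == prob->y[i]) correct++;
            double rate = 100.0 * correct / prob->l;

            if (rate > best_rate) { best_rate = rate; best_c = param->C; best_g = param->gamma; }
            printf("%g %g %g (best c=%g, g=%g, rate=%g)\n",
                   log2c, log2g, rate, best_c, best_g, best_rate);
        }
    free(target);
}

The printed lines follow the same shape as the progress lines grid.py reports in the parallel example above.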
+ Part III: LIBSVM format checking tools
+
+ Introduction
+ ============
+
+ `svm-train' conducts only a simple check of the input data. To do a
+ detailed check, we provide the Python script `checkdata.py'.
+
+ Usage: checkdata.py dataset
+
+ Exit status (returned value): 1 if there are errors, 0 otherwise.
+
+ This tool was written by Rong-En Fan at National Taiwan University.
+
+ Example
+ =======
+
+ > cat bad_data
+ 1 3:1 2:4
+ > python checkdata.py bad_data
+ line 1: feature indices must be in an ascending order, previous/current features 3:1 2:4
+ Found 1 lines with error.
+
+
data/ext/libsvm/tools/checkdata.py ADDED
@@ -0,0 +1,108 @@
+ #!/usr/bin/env python
+
+ #
+ # A format checker for LIBSVM
+ #
+
+ #
+ # Copyright (c) 2007, Rong-En Fan
+ #
+ # All rights reserved.
+ #
+ # This program is distributed under the same license of the LIBSVM package.
+ #
+
+ from sys import argv, exit
+ import os.path
+
+ def err(line_no, msg):
+     print("line %d: %s" % (line_no, msg))
+
+ # works like float() but does not accept nan and inf
+ def my_float(x):
+     if x.lower().find("nan") != -1 or x.lower().find("inf") != -1:
+         raise ValueError
+
+     return float(x)
+
+ def main():
+     if len(argv) != 2:
+         print("Usage: %s dataset" % (argv[0]))
+         exit(1)
+
+     dataset = argv[1]
+
+     if not os.path.exists(dataset):
+         print("dataset %s not found" % (dataset))
+         exit(1)
+
+     line_no = 1
+     error_line_count = 0
+     for line in open(dataset, 'r'):
+         line_error = False
+
+         # each line must end with a newline character
+         if line[-1] != '\n':
+             err(line_no, "missing a newline character in the end")
+             line_error = True
+
+         nodes = line.split()
+
+         # check label
+         try:
+             label = nodes.pop(0)
+
+             if label.find(',') != -1:
+                 # multi-label format
+                 try:
+                     for l in label.split(','):
+                         l = my_float(l)
+                 except:
+                     err(line_no, "label %s is not a valid multi-label form" % label)
+                     line_error = True
+             else:
+                 try:
+                     label = my_float(label)
+                 except:
+                     err(line_no, "label %s is not a number" % label)
+                     line_error = True
+         except:
+             err(line_no, "missing label, perhaps an empty line?")
+             line_error = True
+
+         # check features
+         prev_index = -1
+         for i in range(len(nodes)):
+             try:
+                 (index, value) = nodes[i].split(':')
+
+                 index = int(index)
+                 value = my_float(value)
+
+                 # precomputed kernel's index starts from 0 and LIBSVM
+                 # checks it. Hence, don't treat index 0 as an error.
+                 if index < 0:
+                     err(line_no, "feature index must be positive; wrong feature %s" % nodes[i])
+                     line_error = True
+                 elif index < prev_index:
+                     err(line_no, "feature indices must be in an ascending order, previous/current features %s %s" % (nodes[i-1], nodes[i]))
+                     line_error = True
+                 prev_index = index
+             except:
+                 err(line_no, "feature '%s' not an <index>:<value> pair, <index> integer, <value> real number " % nodes[i])
+                 line_error = True
+
+         line_no += 1
+
+         if line_error:
+             error_line_count += 1
+
+     if error_line_count > 0:
+         print("Found %d lines with error." % (error_line_count))
+         return 1
+     else:
+         print("No error.")
+         return 0
+
+ if __name__ == "__main__":
+     exit(main())
data/ext/libsvm/tools/easy.py ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env python
+
+ import sys
+ import os
+ from subprocess import *
+
+ if len(sys.argv) <= 1:
+     print('Usage: %s training_file [testing_file]' % sys.argv[0])
+     raise SystemExit
+
+ # svm, grid, and gnuplot executable files
+
+ is_win32 = (sys.platform == 'win32')
+ if not is_win32:
+     svmscale_exe = "../svm-scale"
+     svmtrain_exe = "../svm-train"
+     svmpredict_exe = "../svm-predict"
+     grid_py = "./grid.py"
+     gnuplot_exe = "/usr/bin/gnuplot"
+ else:
+     # example for windows
+     svmscale_exe = r"..\windows\svm-scale.exe"
+     svmtrain_exe = r"..\windows\svm-train.exe"
+     svmpredict_exe = r"..\windows\svm-predict.exe"
+     gnuplot_exe = r"c:\tmp\gnuplot\bin\pgnuplot.exe"
+     grid_py = r".\grid.py"
+
+ assert os.path.exists(svmscale_exe),"svm-scale executable not found"
+ assert os.path.exists(svmtrain_exe),"svm-train executable not found"
+ assert os.path.exists(svmpredict_exe),"svm-predict executable not found"
+ assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
+ assert os.path.exists(grid_py),"grid.py not found"
+
+ train_pathname = sys.argv[1]
+ assert os.path.exists(train_pathname),"training file not found"
+ file_name = os.path.split(train_pathname)[1]
+ scaled_file = file_name + ".scale"
+ model_file = file_name + ".model"
+ range_file = file_name + ".range"
+
+ if len(sys.argv) > 2:
+     test_pathname = sys.argv[2]
+     file_name = os.path.split(test_pathname)[1]
+     assert os.path.exists(test_pathname),"testing file not found"
+     scaled_test_file = file_name + ".scale"
+     predict_test_file = file_name + ".predict"
+
+ cmd = '%s -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file)
+ print('Scaling training data...')
+ Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+ cmd = '%s -svmtrain "%s" -gnuplot "%s" "%s"' % (grid_py, svmtrain_exe, gnuplot_exe, scaled_file)
+ print('Cross validation...')
+ f = Popen(cmd, shell = True, stdout = PIPE).stdout
+
+ line = ''
+ while True:
+     last_line = line
+     line = f.readline()
+     if not line: break
+ c,g,rate = map(float,last_line.split())
+
+ print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
+
+ cmd = '%s -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file)
+ print('Training...')
+ Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+ print('Output model: %s' % model_file)
+ if len(sys.argv) > 2:
+     cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file)
+     print('Scaling testing data...')
+     Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+     cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file)
+     print('Testing...')
+     Popen(cmd, shell = True).communicate()
+
+     print('Output prediction: %s' % predict_test_file)