eluka 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. data/.document +5 -0
  2. data/DOCUMENTATION_STANDARDS +39 -0
  3. data/Gemfile +13 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +19 -0
  7. data/Rakefile +69 -0
  8. data/VERSION +1 -0
  9. data/examples/example.rb +59 -0
  10. data/ext/libsvm/COPYRIGHT +31 -0
  11. data/ext/libsvm/FAQ.html +1749 -0
  12. data/ext/libsvm/Makefile +25 -0
  13. data/ext/libsvm/Makefile.win +33 -0
  14. data/ext/libsvm/README +733 -0
  15. data/ext/libsvm/extconf.rb +1 -0
  16. data/ext/libsvm/heart_scale +270 -0
  17. data/ext/libsvm/java/Makefile +25 -0
  18. data/ext/libsvm/java/libsvm.jar +0 -0
  19. data/ext/libsvm/java/libsvm/svm.java +2776 -0
  20. data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
  21. data/ext/libsvm/java/libsvm/svm_model.java +21 -0
  22. data/ext/libsvm/java/libsvm/svm_node.java +6 -0
  23. data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
  24. data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
  25. data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
  26. data/ext/libsvm/java/svm_predict.java +163 -0
  27. data/ext/libsvm/java/svm_scale.java +350 -0
  28. data/ext/libsvm/java/svm_toy.java +471 -0
  29. data/ext/libsvm/java/svm_train.java +318 -0
  30. data/ext/libsvm/java/test_applet.html +1 -0
  31. data/ext/libsvm/python/Makefile +4 -0
  32. data/ext/libsvm/python/README +331 -0
  33. data/ext/libsvm/python/svm.py +259 -0
  34. data/ext/libsvm/python/svmutil.py +242 -0
  35. data/ext/libsvm/svm-predict.c +226 -0
  36. data/ext/libsvm/svm-scale.c +353 -0
  37. data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
  38. data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
  39. data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
  40. data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
  41. data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
  42. data/ext/libsvm/svm-toy/gtk/main.c +23 -0
  43. data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
  44. data/ext/libsvm/svm-toy/qt/Makefile +17 -0
  45. data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
  46. data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
  47. data/ext/libsvm/svm-train.c +376 -0
  48. data/ext/libsvm/svm.cpp +3060 -0
  49. data/ext/libsvm/svm.def +19 -0
  50. data/ext/libsvm/svm.h +105 -0
  51. data/ext/libsvm/svm.o +0 -0
  52. data/ext/libsvm/tools/README +149 -0
  53. data/ext/libsvm/tools/checkdata.py +108 -0
  54. data/ext/libsvm/tools/easy.py +79 -0
  55. data/ext/libsvm/tools/grid.py +359 -0
  56. data/ext/libsvm/tools/subset.py +146 -0
  57. data/ext/libsvm/windows/libsvm.dll +0 -0
  58. data/ext/libsvm/windows/svm-predict.exe +0 -0
  59. data/ext/libsvm/windows/svm-scale.exe +0 -0
  60. data/ext/libsvm/windows/svm-toy.exe +0 -0
  61. data/ext/libsvm/windows/svm-train.exe +0 -0
  62. data/lib/eluka.rb +10 -0
  63. data/lib/eluka/bijection.rb +23 -0
  64. data/lib/eluka/data_point.rb +36 -0
  65. data/lib/eluka/document.rb +47 -0
  66. data/lib/eluka/feature_vector.rb +86 -0
  67. data/lib/eluka/features.rb +31 -0
  68. data/lib/eluka/model.rb +129 -0
  69. data/lib/fselect.rb +321 -0
  70. data/lib/grid.rb +25 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_eluka.rb +7 -0
  73. metadata +214 -0
@@ -0,0 +1,19 @@
+ LIBRARY libsvm
+ EXPORTS
+ svm_train @1
+ svm_cross_validation @2
+ svm_save_model @3
+ svm_load_model @4
+ svm_get_svm_type @5
+ svm_get_nr_class @6
+ svm_get_labels @7
+ svm_get_svr_probability @8
+ svm_predict_values @9
+ svm_predict @10
+ svm_predict_probability @11
+ svm_free_model_content @12
+ svm_free_and_destroy_model @13
+ svm_destroy_param @14
+ svm_check_parameter @15
+ svm_check_probability_model @16
+ svm_set_print_string_function @17
data/ext/libsvm/svm.h ADDED
@@ -0,0 +1,105 @@
+ #ifndef _LIBSVM_H
+ #define _LIBSVM_H
+
+ #define LIBSVM_VERSION 300
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ extern int libsvm_version;
+
+ struct svm_node
+ {
+     int index;
+     double value;
+ };
+
+ struct svm_problem
+ {
+     int l;
+     double *y;
+     struct svm_node **x;
+ };
+
+ enum { C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR }; /* svm_type */
+ enum { LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED }; /* kernel_type */
+
+ struct svm_parameter
+ {
+     int svm_type;
+     int kernel_type;
+     int degree; /* for poly */
+     double gamma; /* for poly/rbf/sigmoid */
+     double coef0; /* for poly/sigmoid */
+
+     /* these are for training only */
+     double cache_size; /* in MB */
+     double eps; /* stopping criteria */
+     double C; /* for C_SVC, EPSILON_SVR and NU_SVR */
+     int nr_weight; /* for C_SVC */
+     int *weight_label; /* for C_SVC */
+     double* weight; /* for C_SVC */
+     double nu; /* for NU_SVC, ONE_CLASS, and NU_SVR */
+     double p; /* for EPSILON_SVR */
+     int shrinking; /* use the shrinking heuristics */
+     int probability; /* do probability estimates */
+ };
+
+ //
+ // svm_model
+ //
+ struct svm_model
+ {
+     struct svm_parameter param; /* parameter */
+     int nr_class; /* number of classes, = 2 in regression/one class svm */
+     int l; /* total #SV */
+     struct svm_node **SV; /* SVs (SV[l]) */
+     double **sv_coef; /* coefficients for SVs in decision functions (sv_coef[k-1][l]) */
+     double *rho; /* constants in decision functions (rho[k*(k-1)/2]) */
+     double *probA; /* pariwise probability information */
+     double *probB;
+
+     /* for classification only */
+
+     int *label; /* label of each class (label[k]) */
+     int *nSV; /* number of SVs for each class (nSV[k]) */
+                 /* nSV[0] + nSV[1] + ... + nSV[k-1] = l */
+     /* XXX */
+     int free_sv; /* 1 if svm_model is created by svm_load_model*/
+                 /* 0 if svm_model is created by svm_train */
+ };
+
+ struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param);
+ void svm_cross_validation(const struct svm_problem *prob, const struct svm_parameter *param, int nr_fold, double *target);
+
+ int svm_save_model(const char *model_file_name, const struct svm_model *model);
+ struct svm_model *svm_load_model(const char *model_file_name);
+
+ int svm_get_svm_type(const struct svm_model *model);
+ int svm_get_nr_class(const struct svm_model *model);
+ void svm_get_labels(const struct svm_model *model, int *label);
+ double svm_get_svr_probability(const struct svm_model *model);
+
+ double svm_predict_values(const struct svm_model *model, const struct svm_node *x, double* dec_values);
+ double svm_predict(const struct svm_model *model, const struct svm_node *x);
+ double svm_predict_probability(const struct svm_model *model, const struct svm_node *x, double* prob_estimates);
+
+ void svm_free_model_content(struct svm_model *model_ptr);
+ void svm_free_and_destroy_model(struct svm_model **model_ptr_ptr);
+ void svm_destroy_param(struct svm_parameter *param);
+
+ const char *svm_check_parameter(const struct svm_problem *prob, const struct svm_parameter *param);
+ int svm_check_probability_model(const struct svm_model *model);
+
+ void svm_set_print_string_function(void (*print_func)(const char *));
+
+ // deprecated
+ // this function will be removed in future release
+ void svm_destroy_model(struct svm_model *model_ptr);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif /* _LIBSVM_H */
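The header above is the complete C API the gem's extension builds against. As a rough, hypothetical sketch only (this caller is not part of the package), the usual call sequence, assuming svm.h is on the include path and the program is linked against the compiled svm.cpp, is: fill an svm_problem, set an svm_parameter, validate it with svm_check_parameter, train, predict, and free:

/* sketch.c -- illustrative caller, not shipped with the gem */
#include <stdio.h>
#include "svm.h"

int main(void)
{
    /* two 1-D training points; each sparse vector ends with index = -1 */
    struct svm_node x0[] = { {1, 0.0}, {-1, 0.0} };
    struct svm_node x1[] = { {1, 1.0}, {-1, 0.0} };
    struct svm_node *xs[] = { x0, x1 };
    double ys[] = { -1.0, +1.0 };
    struct svm_problem prob = { 2, ys, xs };      /* l, y, x */

    struct svm_parameter param = { 0 };           /* unused fields stay zero/NULL */
    param.svm_type    = C_SVC;
    param.kernel_type = LINEAR;
    param.C           = 1.0;
    param.cache_size  = 100;                      /* MB */
    param.eps         = 1e-3;
    param.shrinking   = 1;

    const char *msg = svm_check_parameter(&prob, &param);
    if (msg) { fprintf(stderr, "bad parameters: %s\n", msg); return 1; }

    struct svm_model *model = svm_train(&prob, &param);
    printf("prediction for x1: %g\n", svm_predict(model, x1));

    svm_free_and_destroy_model(&model);
    svm_destroy_param(&param);
    return 0;
}

The terminating {-1, 0.0} entries follow libsvm's sparse-vector convention: svm_predict and svm_train read svm_node arrays until they hit an index of -1.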
data/ext/libsvm/svm.o ADDED
Binary file
data/ext/libsvm/tools/README ADDED
@@ -0,0 +1,149 @@
+ This directory includes some useful codes:
+
+ 1. subset selection tools.
+ 2. parameter selection tools.
+ 3. LIBSVM format checking tools
+
+ Part I: Subset selection tools
+
+ Introduction
+ ============
+
+ Training large data is time consuming. Sometimes one should work on a
+ smaller subset first. The python script subset.py randomly selects a
+ specified number of samples. For classification data, we provide a
+ stratified selection to ensure the same class distribution in the
+ subset.
+
+ Usage: subset.py [options] dataset number [output1] [output2]
+
+ This script selects a subset of the given data set.
+
+ options:
+ -s method : method of selection (default 0)
+      0 -- stratified selection (classification only)
+      1 -- random selection
+
+ output1 : the subset (optional)
+ output2 : the rest of data (optional)
+
+ If output1 is omitted, the subset will be printed on the screen.
+
+ Example
+ =======
+
+ > python subset.py heart_scale 100 file1 file2
+
+ From heart_scale 100 samples are randomly selected and stored in
+ file1. All remaining instances are stored in file2.
+
+
+ Part II: Parameter Selection Tools
+
+ Introduction
+ ============
+
+ grid.py is a parameter selection tool for C-SVM classification using
+ the RBF (radial basis function) kernel. It uses cross validation (CV)
+ technique to estimate the accuracy of each parameter combination in
+ the specified range and helps you to decide the best parameters for
+ your problem.
+
+ grid.py directly executes libsvm binaries (so no python binding is needed)
+ for cross validation and then draw contour of CV accuracy using gnuplot.
+ You must have libsvm and gnuplot installed before using it. The package
+ gnuplot is available at http://www.gnuplot.info/
+
+ On Mac OSX, the precompiled gnuplot file needs the library Aquarterm,
+ which thus must be installed as well. In addition, this version of
+ gnuplot does not support png, so you need to change "set term png
+ transparent small" and use other image formats. For example, you may
+ have "set term pbm small color".
+
+ Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-v fold]
+ [-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
+ [additional parameters for svm-train] dataset
+
+ The program conducts v-fold cross validation using parameter C (and gamma)
+ = 2^begin, 2^(begin+step), ..., 2^end.
+
+ You can specify where the libsvm executable and gnuplot are using the
+ -svmtrain and -gnuplot parameters.
+
+ For windows users, please use pgnuplot.exe. If you are using gnuplot
+ 3.7.1, please upgrade to version 3.7.3 or higher. The version 3.7.1
+ has a bug. If you use cygwin on windows, please use gunplot-x11.
+
+ Example
+ =======
+
+ > python grid.py -log2c -5,5,1 -log2g -4,0,1 -v 5 -m 300 heart_scale
+
+ Users (in particular MS Windows users) may need to specify the path of
+ executable files. You can either change paths in the beginning of
+ grid.py or specify them in the command line. For example,
+
+ > grid.py -log2c -5,5,1 -svmtrain c:\libsvm\windows\svm-train.exe -gnuplot c:\tmp\gnuplot\bin\pgnuplot.exe -v 10 heart_scale
+
+ Output: two files
+ dataset.png: the CV accuracy contour plot generated by gnuplot
+ dataset.out: the CV accuracy at each (log2(C),log2(gamma))
+
+ Parallel grid search
+ ====================
+
+ You can conduct a parallel grid search by dispatching jobs to a
+ cluster of computers which share the same file system. First, you add
+ machine names in grid.py:
+
+ ssh_workers = ["linux1", "linux5", "linux5"]
+
+ and then setup your ssh so that the authentication works without
+ asking a password.
+
+ The same machine (e.g., linux5 here) can be listed more than once if
+ it has multiple CPUs or has more RAM. If the local machine is the
+ best, you can also enlarge the nr_local_worker. For example:
+
+ nr_local_worker = 2
+
+ Example:
+
+ > python grid.py heart_scale
+ [local] -1 -1 78.8889 (best c=0.5, g=0.5, rate=78.8889)
+ [linux5] -1 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
+ [linux5] 5 -1 77.037 (best c=0.5, g=0.0078125, rate=83.3333)
+ [linux1] 5 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
+ .
+ .
+ .
+
+ If -log2c, -log2g, or -v is not specified, default values are used.
+
+ If your system uses telnet instead of ssh, you list the computer names
+ in telnet_workers.
+
+ Part III: LIBSVM format checking tools
+
+ Introduction
+ ============
+
+ `svm-train' conducts only a simple check of the input data. To do a
+ detailed check, we provide a python script `checkdata.py.'
+
+ Usage: checkdata.py dataset
+
+ Exit status (returned value): 1 if there are errors, 0 otherwise.
+
+ This tool is written by Rong-En Fan at National Taiwan University.
+
+ Example
+ =======
+
+ > cat bad_data
+ 1 3:1 2:4
+ > python checkdata.py bad_data
+ line 1: feature indices must be in an ascending order, previous/current features 3:1 2:4
+ Found 1 lines with error.
+
+
data/ext/libsvm/tools/checkdata.py ADDED
@@ -0,0 +1,108 @@
+ #!/usr/bin/env python
+
+ #
+ # A format checker for LIBSVM
+ #
+
+ #
+ # Copyright (c) 2007, Rong-En Fan
+ #
+ # All rights reserved.
+ #
+ # This program is distributed under the same license of the LIBSVM package.
+ #
+
+ from sys import argv, exit
+ import os.path
+
+ def err(line_no, msg):
+     print("line %d: %s" % (line_no, msg))
+
+ # works like float() but does not accept nan and inf
+ def my_float(x):
+     if x.lower().find("nan") != -1 or x.lower().find("inf") != -1:
+         raise ValueError
+
+     return float(x)
+
+ def main():
+     if len(argv) != 2:
+         print("Usage: %s dataset" % (argv[0]))
+         exit(1)
+
+     dataset = argv[1]
+
+     if not os.path.exists(dataset):
+         print("dataset %s not found" % (dataset))
+         exit(1)
+
+     line_no = 1
+     error_line_count = 0
+     for line in open(dataset, 'r'):
+         line_error = False
+
+         # each line must end with a newline character
+         if line[-1] != '\n':
+             err(line_no, "missing a newline character in the end")
+             line_error = True
+
+         nodes = line.split()
+
+         # check label
+         try:
+             label = nodes.pop(0)
+
+             if label.find(',') != -1:
+                 # multi-label format
+                 try:
+                     for l in label.split(','):
+                         l = my_float(l)
+                 except:
+                     err(line_no, "label %s is not a valid multi-label form" % label)
+                     line_error = True
+             else:
+                 try:
+                     label = my_float(label)
+                 except:
+                     err(line_no, "label %s is not a number" % label)
+                     line_error = True
+         except:
+             err(line_no, "missing label, perhaps an empty line?")
+             line_error = True
+
+         # check features
+         prev_index = -1
+         for i in range(len(nodes)):
+             try:
+                 (index, value) = nodes[i].split(':')
+
+                 index = int(index)
+                 value = my_float(value)
+
+                 # precomputed kernel's index starts from 0 and LIBSVM
+                 # checks it. Hence, don't treat index 0 as an error.
+                 if index < 0:
+                     err(line_no, "feature index must be positive; wrong feature %s" % nodes[i])
+                     line_error = True
+                 elif index < prev_index:
+                     err(line_no, "feature indices must be in an ascending order, previous/current features %s %s" % (nodes[i-1], nodes[i]))
+                     line_error = True
+                 prev_index = index
+             except:
+                 err(line_no, "feature '%s' not an <index>:<value> pair, <index> integer, <value> real number " % nodes[i])
+                 line_error = True
+
+         line_no += 1
+
+         if line_error:
+             error_line_count += 1
+
+     if error_line_count > 0:
+         print("Found %d lines with error." % (error_line_count))
+         return 1
+     else:
+         print("No error.")
+         return 0
+
+ if __name__ == "__main__":
+     exit(main())
data/ext/libsvm/tools/easy.py ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env python
+
+ import sys
+ import os
+ from subprocess import *
+
+ if len(sys.argv) <= 1:
+     print('Usage: %s training_file [testing_file]' % sys.argv[0])
+     raise SystemExit
+
+ # svm, grid, and gnuplot executable files
+
+ is_win32 = (sys.platform == 'win32')
+ if not is_win32:
+     svmscale_exe = "../svm-scale"
+     svmtrain_exe = "../svm-train"
+     svmpredict_exe = "../svm-predict"
+     grid_py = "./grid.py"
+     gnuplot_exe = "/usr/bin/gnuplot"
+ else:
+     # example for windows
+     svmscale_exe = r"..\windows\svm-scale.exe"
+     svmtrain_exe = r"..\windows\svm-train.exe"
+     svmpredict_exe = r"..\windows\svm-predict.exe"
+     gnuplot_exe = r"c:\tmp\gnuplot\bin\pgnuplot.exe"
+     grid_py = r".\grid.py"
+
+ assert os.path.exists(svmscale_exe),"svm-scale executable not found"
+ assert os.path.exists(svmtrain_exe),"svm-train executable not found"
+ assert os.path.exists(svmpredict_exe),"svm-predict executable not found"
+ assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
+ assert os.path.exists(grid_py),"grid.py not found"
+
+ train_pathname = sys.argv[1]
+ assert os.path.exists(train_pathname),"training file not found"
+ file_name = os.path.split(train_pathname)[1]
+ scaled_file = file_name + ".scale"
+ model_file = file_name + ".model"
+ range_file = file_name + ".range"
+
+ if len(sys.argv) > 2:
+     test_pathname = sys.argv[2]
+     file_name = os.path.split(test_pathname)[1]
+     assert os.path.exists(test_pathname),"testing file not found"
+     scaled_test_file = file_name + ".scale"
+     predict_test_file = file_name + ".predict"
+
+ cmd = '%s -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file)
+ print('Scaling training data...')
+ Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+ cmd = '%s -svmtrain "%s" -gnuplot "%s" "%s"' % (grid_py, svmtrain_exe, gnuplot_exe, scaled_file)
+ print('Cross validation...')
+ f = Popen(cmd, shell = True, stdout = PIPE).stdout
+
+ line = ''
+ while True:
+     last_line = line
+     line = f.readline()
+     if not line: break
+ c,g,rate = map(float,last_line.split())
+
+ print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
+
+ cmd = '%s -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file)
+ print('Training...')
+ Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+ print('Output model: %s' % model_file)
+ if len(sys.argv) > 2:
+     cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file)
+     print('Scaling testing data...')
+     Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+     cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file)
+     print('Testing...')
+     Popen(cmd, shell = True).communicate()
+
+     print('Output prediction: %s' % predict_test_file)
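As the usage string and the relative paths at the top of the script indicate, easy.py is intended to be run from this tools directory after the svm-scale, svm-train and svm-predict binaries have been built one level up, with gnuplot available for grid.py. A typical invocation, using the bundled heart_scale data set as in the README examples above, would be along the lines of:

> python easy.py ../heart_scale

which scales the data, runs grid.py to pick C and gamma by cross validation, trains a model, and writes heart_scale.model (and, when a second test file is passed, a .predict file named after that test file).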