eluka 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/DOCUMENTATION_STANDARDS +39 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +69 -0
- data/VERSION +1 -0
- data/examples/example.rb +59 -0
- data/ext/libsvm/COPYRIGHT +31 -0
- data/ext/libsvm/FAQ.html +1749 -0
- data/ext/libsvm/Makefile +25 -0
- data/ext/libsvm/Makefile.win +33 -0
- data/ext/libsvm/README +733 -0
- data/ext/libsvm/extconf.rb +1 -0
- data/ext/libsvm/heart_scale +270 -0
- data/ext/libsvm/java/Makefile +25 -0
- data/ext/libsvm/java/libsvm.jar +0 -0
- data/ext/libsvm/java/libsvm/svm.java +2776 -0
- data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
- data/ext/libsvm/java/libsvm/svm_model.java +21 -0
- data/ext/libsvm/java/libsvm/svm_node.java +6 -0
- data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
- data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
- data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
- data/ext/libsvm/java/svm_predict.java +163 -0
- data/ext/libsvm/java/svm_scale.java +350 -0
- data/ext/libsvm/java/svm_toy.java +471 -0
- data/ext/libsvm/java/svm_train.java +318 -0
- data/ext/libsvm/java/test_applet.html +1 -0
- data/ext/libsvm/python/Makefile +4 -0
- data/ext/libsvm/python/README +331 -0
- data/ext/libsvm/python/svm.py +259 -0
- data/ext/libsvm/python/svmutil.py +242 -0
- data/ext/libsvm/svm-predict.c +226 -0
- data/ext/libsvm/svm-scale.c +353 -0
- data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
- data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
- data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
- data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
- data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
- data/ext/libsvm/svm-toy/gtk/main.c +23 -0
- data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
- data/ext/libsvm/svm-toy/qt/Makefile +17 -0
- data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
- data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
- data/ext/libsvm/svm-train.c +376 -0
- data/ext/libsvm/svm.cpp +3060 -0
- data/ext/libsvm/svm.def +19 -0
- data/ext/libsvm/svm.h +105 -0
- data/ext/libsvm/svm.o +0 -0
- data/ext/libsvm/tools/README +149 -0
- data/ext/libsvm/tools/checkdata.py +108 -0
- data/ext/libsvm/tools/easy.py +79 -0
- data/ext/libsvm/tools/grid.py +359 -0
- data/ext/libsvm/tools/subset.py +146 -0
- data/ext/libsvm/windows/libsvm.dll +0 -0
- data/ext/libsvm/windows/svm-predict.exe +0 -0
- data/ext/libsvm/windows/svm-scale.exe +0 -0
- data/ext/libsvm/windows/svm-toy.exe +0 -0
- data/ext/libsvm/windows/svm-train.exe +0 -0
- data/lib/eluka.rb +10 -0
- data/lib/eluka/bijection.rb +23 -0
- data/lib/eluka/data_point.rb +36 -0
- data/lib/eluka/document.rb +47 -0
- data/lib/eluka/feature_vector.rb +86 -0
- data/lib/eluka/features.rb +31 -0
- data/lib/eluka/model.rb +129 -0
- data/lib/fselect.rb +321 -0
- data/lib/grid.rb +25 -0
- data/test/helper.rb +18 -0
- data/test/test_eluka.rb +7 -0
- metadata +214 -0
data/ext/libsvm/svm.def
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
LIBRARY libsvm
|
2
|
+
EXPORTS
|
3
|
+
svm_train @1
|
4
|
+
svm_cross_validation @2
|
5
|
+
svm_save_model @3
|
6
|
+
svm_load_model @4
|
7
|
+
svm_get_svm_type @5
|
8
|
+
svm_get_nr_class @6
|
9
|
+
svm_get_labels @7
|
10
|
+
svm_get_svr_probability @8
|
11
|
+
svm_predict_values @9
|
12
|
+
svm_predict @10
|
13
|
+
svm_predict_probability @11
|
14
|
+
svm_free_model_content @12
|
15
|
+
svm_free_and_destroy_model @13
|
16
|
+
svm_destroy_param @14
|
17
|
+
svm_check_parameter @15
|
18
|
+
svm_check_probability_model @16
|
19
|
+
svm_set_print_string_function @17
|
data/ext/libsvm/svm.h
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
/*
 * Public C interface of LIBSVM: the structures that describe a problem,
 * the training parameters and a trained model, followed by the prototypes
 * of the library entry points.
 *
 * NOTE(review): the include-guard name begins with an underscore followed
 * by an upper-case letter, which C reserves for the implementation.  It is
 * kept unchanged here so that existing code testing for it keeps working.
 */
#ifndef _LIBSVM_H
#define _LIBSVM_H

#define LIBSVM_VERSION 300

#ifdef __cplusplus
extern "C" {
#endif

extern int libsvm_version;

/* An (index, value) pair. */
struct svm_node
{
	int index;
	double value;
};

/* NOTE(review): presumably l instances with targets y[] and vectors x[] -- confirm. */
struct svm_problem
{
	int l;
	double *y;
	struct svm_node **x;
};

enum { C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR };	/* svm_type */
enum { LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED };	/* kernel_type */

struct svm_parameter
{
	int svm_type;		/* one of the svm_type enumerators above */
	int kernel_type;	/* one of the kernel_type enumerators above */
	int degree;		/* for poly */
	double gamma;		/* for poly/rbf/sigmoid */
	double coef0;		/* for poly/sigmoid */

	/* these are for training only */
	double cache_size;	/* in MB */
	double eps;		/* stopping criteria */
	double C;		/* for C_SVC, EPSILON_SVR and NU_SVR */
	int nr_weight;		/* for C_SVC */
	int *weight_label;	/* for C_SVC */
	double *weight;		/* for C_SVC */
	double nu;		/* for NU_SVC, ONE_CLASS, and NU_SVR */
	double p;		/* for EPSILON_SVR */
	int shrinking;		/* use the shrinking heuristics */
	int probability;	/* do probability estimates */
};

/*
 * svm_model
 */
struct svm_model
{
	struct svm_parameter param;	/* parameter */
	int nr_class;		/* number of classes, = 2 in regression/one class svm */
	int l;			/* total #SV */
	struct svm_node **SV;	/* SVs (SV[l]) */
	double **sv_coef;	/* coefficients for SVs in decision functions (sv_coef[k-1][l]) */
	double *rho;		/* constants in decision functions (rho[k*(k-1)/2]) */
	double *probA;		/* pairwise probability information */
	double *probB;

	/* for classification only */

	int *label;		/* label of each class (label[k]) */
	int *nSV;		/* number of SVs for each class (nSV[k]) */
				/* nSV[0] + nSV[1] + ... + nSV[k-1] = l */
	/* XXX */
	int free_sv;		/* 1 if svm_model is created by svm_load_model */
				/* 0 if svm_model is created by svm_train */
};

struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param);
void svm_cross_validation(const struct svm_problem *prob, const struct svm_parameter *param, int nr_fold, double *target);

int svm_save_model(const char *model_file_name, const struct svm_model *model);
struct svm_model *svm_load_model(const char *model_file_name);

int svm_get_svm_type(const struct svm_model *model);
int svm_get_nr_class(const struct svm_model *model);
void svm_get_labels(const struct svm_model *model, int *label);
double svm_get_svr_probability(const struct svm_model *model);

double svm_predict_values(const struct svm_model *model, const struct svm_node *x, double *dec_values);
double svm_predict(const struct svm_model *model, const struct svm_node *x);
double svm_predict_probability(const struct svm_model *model, const struct svm_node *x, double *prob_estimates);

void svm_free_model_content(struct svm_model *model_ptr);
void svm_free_and_destroy_model(struct svm_model **model_ptr_ptr);
void svm_destroy_param(struct svm_parameter *param);

const char *svm_check_parameter(const struct svm_problem *prob, const struct svm_parameter *param);
int svm_check_probability_model(const struct svm_model *model);

void svm_set_print_string_function(void (*print_func)(const char *));

/*
 * deprecated
 * this function will be removed in future release
 */
void svm_destroy_model(struct svm_model *model_ptr);

#ifdef __cplusplus
}
#endif

#endif /* _LIBSVM_H */
|
data/ext/libsvm/svm.o
ADDED
Binary file
|
@@ -0,0 +1,149 @@
|
|
1
|
+
This directory includes some useful codes:
|
2
|
+
|
3
|
+
1. subset selection tools.
|
4
|
+
2. parameter selection tools.
|
5
|
+
3. LIBSVM format checking tools
|
6
|
+
|
7
|
+
Part I: Subset selection tools
|
8
|
+
|
9
|
+
Introduction
|
10
|
+
============
|
11
|
+
|
12
|
+
Training large data is time consuming. Sometimes one should work on a
|
13
|
+
smaller subset first. The python script subset.py randomly selects a
|
14
|
+
specified number of samples. For classification data, we provide a
|
15
|
+
stratified selection to ensure the same class distribution in the
|
16
|
+
subset.
|
17
|
+
|
18
|
+
Usage: subset.py [options] dataset number [output1] [output2]
|
19
|
+
|
20
|
+
This script selects a subset of the given data set.
|
21
|
+
|
22
|
+
options:
|
23
|
+
-s method : method of selection (default 0)
|
24
|
+
0 -- stratified selection (classification only)
|
25
|
+
1 -- random selection
|
26
|
+
|
27
|
+
output1 : the subset (optional)
|
28
|
+
output2 : the rest of data (optional)
|
29
|
+
|
30
|
+
If output1 is omitted, the subset will be printed on the screen.
|
31
|
+
|
32
|
+
Example
|
33
|
+
=======
|
34
|
+
|
35
|
+
> python subset.py heart_scale 100 file1 file2
|
36
|
+
|
37
|
+
From heart_scale 100 samples are randomly selected and stored in
|
38
|
+
file1. All remaining instances are stored in file2.
|
39
|
+
|
40
|
+
|
41
|
+
Part II: Parameter Selection Tools
|
42
|
+
|
43
|
+
Introduction
|
44
|
+
============
|
45
|
+
|
46
|
+
grid.py is a parameter selection tool for C-SVM classification using
|
47
|
+
the RBF (radial basis function) kernel. It uses cross validation (CV)
|
48
|
+
technique to estimate the accuracy of each parameter combination in
|
49
|
+
the specified range and helps you to decide the best parameters for
|
50
|
+
your problem.
|
51
|
+
|
52
|
+
grid.py directly executes libsvm binaries (so no python binding is needed)
|
53
|
+
for cross validation and then draws a contour of CV accuracy using gnuplot.
|
54
|
+
You must have libsvm and gnuplot installed before using it. The package
|
55
|
+
gnuplot is available at http://www.gnuplot.info/
|
56
|
+
|
57
|
+
On Mac OS X, the precompiled gnuplot file needs the library AquaTerm,
|
58
|
+
which thus must be installed as well. In addition, this version of
|
59
|
+
gnuplot does not support png, so you need to change "set term png
|
60
|
+
transparent small" and use other image formats. For example, you may
|
61
|
+
have "set term pbm small color".
|
62
|
+
|
63
|
+
Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-v fold]
|
64
|
+
[-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
|
65
|
+
[additional parameters for svm-train] dataset
|
66
|
+
|
67
|
+
The program conducts v-fold cross validation using parameter C (and gamma)
|
68
|
+
= 2^begin, 2^(begin+step), ..., 2^end.
|
69
|
+
|
70
|
+
You can specify where the libsvm executable and gnuplot are using the
|
71
|
+
-svmtrain and -gnuplot parameters.
|
72
|
+
|
73
|
+
For windows users, please use pgnuplot.exe. If you are using gnuplot
|
74
|
+
3.7.1, please upgrade to version 3.7.3 or higher. The version 3.7.1
|
75
|
+
has a bug. If you use cygwin on windows, please use gnuplot-x11.
|
76
|
+
|
77
|
+
Example
|
78
|
+
=======
|
79
|
+
|
80
|
+
> python grid.py -log2c -5,5,1 -log2g -4,0,1 -v 5 -m 300 heart_scale
|
81
|
+
|
82
|
+
Users (in particular MS Windows users) may need to specify the path of
|
83
|
+
executable files. You can either change paths in the beginning of
|
84
|
+
grid.py or specify them in the command line. For example,
|
85
|
+
|
86
|
+
> grid.py -log2c -5,5,1 -svmtrain c:\libsvm\windows\svm-train.exe -gnuplot c:\tmp\gnuplot\bin\pgnuplot.exe -v 10 heart_scale
|
87
|
+
|
88
|
+
Output: two files
|
89
|
+
dataset.png: the CV accuracy contour plot generated by gnuplot
|
90
|
+
dataset.out: the CV accuracy at each (log2(C),log2(gamma))
|
91
|
+
|
92
|
+
Parallel grid search
|
93
|
+
====================
|
94
|
+
|
95
|
+
You can conduct a parallel grid search by dispatching jobs to a
|
96
|
+
cluster of computers which share the same file system. First, you add
|
97
|
+
machine names in grid.py:
|
98
|
+
|
99
|
+
ssh_workers = ["linux1", "linux5", "linux5"]
|
100
|
+
|
101
|
+
and then set up your ssh so that the authentication works without
|
102
|
+
asking for a password.
|
103
|
+
|
104
|
+
The same machine (e.g., linux5 here) can be listed more than once if
|
105
|
+
it has multiple CPUs or has more RAM. If the local machine is the
|
106
|
+
best, you can also enlarge the nr_local_worker. For example:
|
107
|
+
|
108
|
+
nr_local_worker = 2
|
109
|
+
|
110
|
+
Example:
|
111
|
+
|
112
|
+
> python grid.py heart_scale
|
113
|
+
[local] -1 -1 78.8889 (best c=0.5, g=0.5, rate=78.8889)
|
114
|
+
[linux5] -1 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
|
115
|
+
[linux5] 5 -1 77.037 (best c=0.5, g=0.0078125, rate=83.3333)
|
116
|
+
[linux1] 5 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
|
117
|
+
.
|
118
|
+
.
|
119
|
+
.
|
120
|
+
|
121
|
+
If -log2c, -log2g, or -v is not specified, default values are used.
|
122
|
+
|
123
|
+
If your system uses telnet instead of ssh, you list the computer names
|
124
|
+
in telnet_workers.
|
125
|
+
|
126
|
+
Part III: LIBSVM format checking tools
|
127
|
+
|
128
|
+
Introduction
|
129
|
+
============
|
130
|
+
|
131
|
+
`svm-train' conducts only a simple check of the input data. To do a
|
132
|
+
detailed check, we provide a python script `checkdata.py'.
|
133
|
+
|
134
|
+
Usage: checkdata.py dataset
|
135
|
+
|
136
|
+
Exit status (returned value): 1 if there are errors, 0 otherwise.
|
137
|
+
|
138
|
+
This tool is written by Rong-En Fan at National Taiwan University.
|
139
|
+
|
140
|
+
Example
|
141
|
+
=======
|
142
|
+
|
143
|
+
> cat bad_data
|
144
|
+
1 3:1 2:4
|
145
|
+
> python checkdata.py bad_data
|
146
|
+
line 1: feature indices must be in an ascending order, previous/current features 3:1 2:4
|
147
|
+
Found 1 lines with error.
|
148
|
+
|
149
|
+
|
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
#
|
4
|
+
# A format checker for LIBSVM
|
5
|
+
#
|
6
|
+
|
7
|
+
#
|
8
|
+
# Copyright (c) 2007, Rong-En Fan
|
9
|
+
#
|
10
|
+
# All rights reserved.
|
11
|
+
#
|
12
|
+
# This program is distributed under the same license of the LIBSVM package.
|
13
|
+
#
|
14
|
+
|
15
|
+
from sys import argv, exit
|
16
|
+
import os.path
|
17
|
+
|
18
|
+
def err(line_no, msg):
    """Print one diagnostic to stdout in the form ``line <line_no>: <msg>``.

    line_no -- integer line number (formatted with ``%d``).
    msg     -- text describing the problem found on that line.
    """
    print("line %d: %s" % (line_no, msg))
|
20
|
+
|
21
|
+
# works like float() but does not accept nan and inf
|
22
|
+
def my_float(x):
    """Convert the string ``x`` to a float, refusing NaN and infinities.

    Any text containing "nan" or "inf" (case-insensitively) is rejected
    before conversion, so spellings such as "NaN", "-inf" and "Infinity"
    never reach ``float()``.

    Returns the parsed float.
    Raises ValueError for nan/inf spellings and for anything ``float()``
    itself cannot parse.
    """
    lowered = x.lower()
    if "nan" in lowered or "inf" in lowered:
        raise ValueError

    return float(x)
|
27
|
+
|
28
|
+
def main():
    """Check the LIBSVM-format data file named by ``argv[1]``.

    Every line is checked for a trailing newline, a numeric label (or a
    comma-separated list of numeric labels) and ``<index>:<value>``
    features whose indices are non-negative and strictly increasing.
    One message per problem is printed through ``err``.

    Returns 1 if at least one line has an error, 0 otherwise.
    Calls ``exit(1)`` when the argument count is wrong or the file does
    not exist.
    """
    if len(argv) != 2:
        print("Usage: %s dataset" % (argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset %s not found" % (dataset))
        exit(1)

    error_line_count = 0
    # "with" guarantees the file is closed, even if a check fails unexpectedly.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_error = False

            # each line must end with a newline character
            if line[-1] != '\n':
                err(line_no, "missing a newline character in the end")
                line_error = True

            nodes = line.split()

            # check label
            if not nodes:
                err(line_no, "missing label, perhaps an empty line?")
                line_error = True
            else:
                label = nodes.pop(0)
                if label.find(',') != -1:
                    # multi-label format
                    try:
                        for single_label in label.split(','):
                            my_float(single_label)
                    except ValueError:
                        err(line_no, "label %s is not a valid multi-label form" % label)
                        line_error = True
                else:
                    try:
                        my_float(label)
                    except ValueError:
                        err(line_no, "label %s is not a number" % label)
                        line_error = True

            # check features
            prev_index = -1
            for i in range(len(nodes)):
                try:
                    (index, value) = nodes[i].split(':')
                    index = int(index)
                    my_float(value)
                except ValueError:
                    # wrong number of ':' parts, non-integer index or bad value
                    err(line_no, "feature '%s' not an <index>:<value> pair, <index> integer, <value> real number " % nodes[i])
                    line_error = True
                    continue

                # precomputed kernel's index starts from 0 and LIBSVM
                # checks it. Hence, don't treat index 0 as an error.
                if index < 0:
                    err(line_no, "feature index must be positive; wrong feature %s" % nodes[i])
                    line_error = True
                elif index <= prev_index:
                    # "<=" rather than "<": a repeated index is not ascending either
                    err(line_no, "feature indices must be in an ascending order, previous/current features %s %s" % (nodes[i-1], nodes[i]))
                    line_error = True
                prev_index = index

            if line_error:
                error_line_count += 1

    if error_line_count > 0:
        print("Found %d lines with error." % (error_line_count))
        return 1
    else:
        print("No error.")
        return 0
|
106
|
+
|
107
|
+
# Run the checker only when executed as a script; main()'s return value
# (1 on errors, 0 otherwise) becomes the process exit status.
if __name__ == "__main__":
    exit(main())
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import os
|
5
|
+
from subprocess import *
|
6
|
+
|
7
|
+
# Pipeline driver: scale the training data, grid-search C and gamma with
# grid.py, train a model with the best pair and, if a testing file is
# given, scale it with the same ranges and predict on it.
#
# NOTE(review): every command below is built by string interpolation and
# run with shell=True; a path containing a double quote would break or
# alter the command line.

if len(sys.argv) <= 1:
    print('Usage: %s training_file [testing_file]' % sys.argv[0])
    raise SystemExit

# svm, grid, and gnuplot executable files

is_win32 = (sys.platform == 'win32')
if not is_win32:
    svmscale_exe = "../svm-scale"
    svmtrain_exe = "../svm-train"
    svmpredict_exe = "../svm-predict"
    grid_py = "./grid.py"
    gnuplot_exe = "/usr/bin/gnuplot"
else:
    # example for windows
    svmscale_exe = r"..\windows\svm-scale.exe"
    svmtrain_exe = r"..\windows\svm-train.exe"
    svmpredict_exe = r"..\windows\svm-predict.exe"
    gnuplot_exe = r"c:\tmp\gnuplot\bin\pgnuplot.exe"
    grid_py = r".\grid.py"

assert os.path.exists(svmscale_exe),"svm-scale executable not found"
assert os.path.exists(svmtrain_exe),"svm-train executable not found"
assert os.path.exists(svmpredict_exe),"svm-predict executable not found"
assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
assert os.path.exists(grid_py),"grid.py not found"

# Output files are written to the current directory, named after the input.
train_pathname = sys.argv[1]
assert os.path.exists(train_pathname),"training file not found"
file_name = os.path.split(train_pathname)[1]
scaled_file = file_name + ".scale"
model_file = file_name + ".model"
range_file = file_name + ".range"

if len(sys.argv) > 2:
    test_pathname = sys.argv[2]
    file_name = os.path.split(test_pathname)[1]
    assert os.path.exists(test_pathname),"testing file not found"
    scaled_test_file = file_name + ".scale"
    predict_test_file = file_name + ".predict"

cmd = '%s -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file)
print('Scaling training data...')
Popen(cmd, shell = True, stdout = PIPE).communicate()

cmd = '%s -svmtrain "%s" -gnuplot "%s" "%s"' % (grid_py, svmtrain_exe, gnuplot_exe, scaled_file)
print('Cross validation...')
# communicate() drains the pipe and waits for grid.py, so the child is
# reaped and its stdout handle is closed.
grid_output = Popen(cmd, shell = True, stdout = PIPE).communicate()[0]

# The last line printed by grid.py holds the best "c g rate" triple.
grid_lines = grid_output.splitlines()
last_line = grid_lines[-1] if grid_lines else grid_output
fields = last_line.split()
if len(fields) != 3:
    # Fail with a clear message instead of an unpacking ValueError.
    sys.exit('grid.py did not report "c g rate" on its last output line: %r' % last_line)
c,g,rate = map(float,fields)

print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))

cmd = '%s -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file)
print('Training...')
Popen(cmd, shell = True, stdout = PIPE).communicate()

print('Output model: %s' % model_file)
if len(sys.argv) > 2:
    # -r re-applies the ranges saved while scaling the training data.
    cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file)
    print('Scaling testing data...')
    Popen(cmd, shell = True, stdout = PIPE).communicate()

    cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file)
    print('Testing...')
    Popen(cmd, shell = True).communicate()

    print('Output prediction: %s' % predict_test_file)
|