eluka 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/DOCUMENTATION_STANDARDS +39 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +69 -0
- data/VERSION +1 -0
- data/examples/example.rb +59 -0
- data/ext/libsvm/COPYRIGHT +31 -0
- data/ext/libsvm/FAQ.html +1749 -0
- data/ext/libsvm/Makefile +25 -0
- data/ext/libsvm/Makefile.win +33 -0
- data/ext/libsvm/README +733 -0
- data/ext/libsvm/extconf.rb +1 -0
- data/ext/libsvm/heart_scale +270 -0
- data/ext/libsvm/java/Makefile +25 -0
- data/ext/libsvm/java/libsvm.jar +0 -0
- data/ext/libsvm/java/libsvm/svm.java +2776 -0
- data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
- data/ext/libsvm/java/libsvm/svm_model.java +21 -0
- data/ext/libsvm/java/libsvm/svm_node.java +6 -0
- data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
- data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
- data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
- data/ext/libsvm/java/svm_predict.java +163 -0
- data/ext/libsvm/java/svm_scale.java +350 -0
- data/ext/libsvm/java/svm_toy.java +471 -0
- data/ext/libsvm/java/svm_train.java +318 -0
- data/ext/libsvm/java/test_applet.html +1 -0
- data/ext/libsvm/python/Makefile +4 -0
- data/ext/libsvm/python/README +331 -0
- data/ext/libsvm/python/svm.py +259 -0
- data/ext/libsvm/python/svmutil.py +242 -0
- data/ext/libsvm/svm-predict.c +226 -0
- data/ext/libsvm/svm-scale.c +353 -0
- data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
- data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
- data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
- data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
- data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
- data/ext/libsvm/svm-toy/gtk/main.c +23 -0
- data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
- data/ext/libsvm/svm-toy/qt/Makefile +17 -0
- data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
- data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
- data/ext/libsvm/svm-train.c +376 -0
- data/ext/libsvm/svm.cpp +3060 -0
- data/ext/libsvm/svm.def +19 -0
- data/ext/libsvm/svm.h +105 -0
- data/ext/libsvm/svm.o +0 -0
- data/ext/libsvm/tools/README +149 -0
- data/ext/libsvm/tools/checkdata.py +108 -0
- data/ext/libsvm/tools/easy.py +79 -0
- data/ext/libsvm/tools/grid.py +359 -0
- data/ext/libsvm/tools/subset.py +146 -0
- data/ext/libsvm/windows/libsvm.dll +0 -0
- data/ext/libsvm/windows/svm-predict.exe +0 -0
- data/ext/libsvm/windows/svm-scale.exe +0 -0
- data/ext/libsvm/windows/svm-toy.exe +0 -0
- data/ext/libsvm/windows/svm-train.exe +0 -0
- data/lib/eluka.rb +10 -0
- data/lib/eluka/bijection.rb +23 -0
- data/lib/eluka/data_point.rb +36 -0
- data/lib/eluka/document.rb +47 -0
- data/lib/eluka/feature_vector.rb +86 -0
- data/lib/eluka/features.rb +31 -0
- data/lib/eluka/model.rb +129 -0
- data/lib/fselect.rb +321 -0
- data/lib/grid.rb +25 -0
- data/test/helper.rb +18 -0
- data/test/test_eluka.rb +7 -0
- metadata +214 -0
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/eluka.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#Creates a two way hash like lookup using two hashes
#Source inspired from a post in some forum
#Author: Unknown
module Eluka
  # A Hash that also maintains the inverse mapping (value -> key) so that
  # lookups can be done in both directions in O(1).
  #
  # Invariant: the mapping stays one-to-one. Inserting a duplicate value
  # evicts the key that previously held it, and overwriting a key removes
  # its old value from the reverse index.
  class Bijection < Hash
    # Builds the hash and the (initially empty) reverse index.
    def initialize(*args)
      super(*args)
      @reverse = self.invert
    end

    # Associates +key+ with +val+, keeping both directions consistent.
    def []=(key, val)
      # Bug fix: when an existing key is re-assigned a new value, drop the
      # stale reverse entry for its old value (previously @reverse kept
      # pointing old_val => key forever).
      @reverse.delete(self[key]) if self.has_key?(key)
      super(key, val)
      if @reverse.has_key?(val)
        # Value already taken by another key: evict that key.
        self.delete(@reverse[val])
      end
      @reverse[val] = key
    end

    # Returns the key mapped to +val+, or nil if none.
    def lookup(val)
      @reverse[val]
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Eluka
  # A single data instance: a Hash of field => value pairs, where each value
  # is either numeric (used directly as a feature value) or a String
  # (analyzed into per-term features via Eluka::Document).
  class DataPoint

    # data     - a String (wrapped as {:text => data}) or a non-empty Hash
    # analyzer - text analyzer handed to Eluka::Document for String fields
    #
    # Raises RuntimeError when data is nil, not a Hash/String, or empty.
    def initialize(data, analyzer)
      raise "Can't find any data" unless (data)

      if data.instance_of?(String)
        data = {:text => data}
      end

      raise "Invalid data added" unless (data.instance_of? Hash)
      raise "Data can't be empty" unless (data.size > 0)

      @data = data
      @analyzer = analyzer
    end

    # Flattens the data into a single feature => value Hash.
    # String fields expand into "field||term" => weight entries; numeric
    # fields are copied through unchanged.
    def vector
      vector = Hash.new

      @data.each do |field, value|
        if value.instance_of?(String) then
          doc_vec = Eluka::Document.new(field, value, @analyzer).vector
          vector.merge!(doc_vec)
        elsif value.is_a?(Integer) or value.is_a?(Float)
          # Bug fix: Integer replaces the legacy Fixnum check (Fixnum was
          # deprecated in 2.4 and removed in Ruby 3.2); arbitrarily large
          # integers are accepted as well.
          vector[field] = value
        else
          raise "A field can contain either an integer or a double or it can be a string"
        end
      end

      vector
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
module Eluka

  # Wraps a single text field and turns it into an L2-normalised
  # term-frequency vector using the supplied analyzer.
  class Document
    # field    - name the terms are namespaced under ("field||term" keys)
    # text     - raw text to tokenise
    # analyzer - object responding to token_stream(field, text)
    def initialize(field, text, analyzer)
      @field    = field
      @text     = text
      @analyzer = analyzer
      @bag_of_words = nil
      bag_of_words
    end

    # Tokenises @text and records, for every distinct term, the list of
    # token positions at which it occurs (stored in @bag_of_words).
    def bag_of_words
      position = 0

      @bag_of_words = {}

      # NOTE(review): the literal symbol :field is passed, not @field —
      # presumably the analyzer ignores the field name; confirm.
      stream = @analyzer.token_stream(:field, @text)
      while (token = stream.next)
        position += token.pos_inc
        (@bag_of_words[token.text] ||= []) << position
      end

    end

    # Builds the feature vector: each key is "field||term", each value the
    # term frequency divided by the Euclidean norm of all term frequencies
    # in this document.
    def vector
      norm_squared = @bag_of_words.values.reduce(0) do |acc, positions|
        acc + positions.size**2
      end

      norm = norm_squared.to_f**0.5

      result = {}
      @bag_of_words.each do |term, positions|
        result[[@field, term].join("||")] = positions.size.to_f / norm
      end

      result
    end
  end

end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Feature Vectors performs two important functions
#
# 1. Maintain a list of <feature, value> pairs for each data point (vector)
#    so that a model can be built whenever needed. (Sparse representation)
#
# 2. Adds every feature to the Features object to maintain
#    a unique list of features
#
# TODO: On disk representation for large training data
module Eluka

  class FeatureVectors

    # features - the global Features registry (term <-> id)
    # train    - true when these vectors are training data; only then are
    #            newly-seen terms registered with the features list
    def initialize (features, train)
      @fvs      = []
      @features = features
      @train    = train
    end

    # Stores a raw (vector, label) pair; conversion to LibSVM form happens
    # lazily in #to_libSVM.
    def add (vector, label = 0)
      @fvs << [vector, label]
    end

    # Registers every term appearing in the stored vectors with the global
    # feature list (used for training data only).
    def define_features
      @fvs.each do |vector, _label|
        vector.each_key { |term| @features.add(term) }
      end
    end

    # Renders the stored data points in LibSVM format: one line per point,
    # "label id:value id:value ...", ids ascending.
    #
    # sel_features - optional whitelist of terms; when given, only those
    #                features are emitted.
    def to_libSVM (sel_features = nil)

      # Whitelist lookup table
      wanted = {}
      if (sel_features)
        sel_features.each { |f| wanted[f] = 1 }
      end

      define_features if (@train) #Registration is only needed for training data

      lines = @fvs.map do |vector, label|
        parts = [label]

        #OPTIMIZE: Change this line to consider sorting in case of terms being features
        (1..@features.f_count).each do |id|
          term  = @features.term(id)
          value = vector[term]
          next unless value
          parts << [id, value].join(":") if wanted[term] or not sel_features
        end

        parts.join(" ")
      end
      lines.join("\n")
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
module Eluka

  # Registry assigning consecutive integer ids (starting at 1) to terms,
  # with constant-time lookup in both directions via a Bijection.
  class Features
    def initialize
      @features = Eluka::Bijection.new
      @f_count  = 0
    end

    # Number of distinct features registered so far.
    attr_reader :f_count

    # Registers +term+ if unseen and returns its id.
    def add (term)
      existing = @features[term]
      return existing if existing
      @f_count += 1
      @features[term] = @f_count
      @f_count
    end

    # Returns the id for +term+, or nil if unknown.
    def id (term)
      @features[term]
    end

    # Returns the term registered under +id+, or nil if unknown.
    def term (id)
      @features.lookup(id)
    end

  end

end
|
31
|
+
|
data/lib/eluka/model.rb
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
|
2
|
+
module Eluka

  #A binary classifier classifies data into two classes given a category (the class label)
  # a. Data which is indicative of the category -- positive data
  # b. Data which is not indicative of the category -- negative data
  #
  #== Model
  # A classifier model observes positive and negative data and learns the properties of
  #each set. In the future if given an unlabelled data point it decides whether
  #the data point is a positive or negative instance of the category.
  #
  #=== Internal Data Representation
  # A classifier model internally represents a data instance as a point in a vector space
  # The dimensions of the vector space are termed as features
  #
  #=== Eluka::Model
  # An Eluka model takes a hash of features and their values and internally processes them
  #as points in a vector space. If the input is a string of words like in a document then
  #it relies on Ferret's text analysis modules to convert it into a data point

  class Model

    include Ferret::Analysis

    # Initialize the classifier with sane defaults
    # if customised data is not provided
    #
    # params - optional Hash overriding :directory (scratch dir, default
    #          "/tmp"), the :svm_train_path/:svm_scale_path/:svm_predict_path
    #          binaries, :grid_py_path, :fselect_py_path and :verbose.
    def initialize (params = {})
      #Set the labels: +1 positive, -1 negative, 0 unknown (LibSVM convention)
      @labels = Bijection.new
      @labels[:positive] = 1
      @labels[:negative] = -1
      @labels[:unknown] = 0

      # Paths to the bundled eluka-svm-* executables, relative to the gem root.
      @gem_root = File.expand_path(File.join(File.dirname(__FILE__), '..'))
      @bin_dir = File.expand_path(File.join(File.dirname(@gem_root), 'bin'))

      @analyzer = StandardAnalyzer.new
      @features = Eluka::Features.new
      @fv_train = Eluka::FeatureVectors.new(@features, true)
      @fv_test = nil

      @directory = (params[:directory] or "/tmp")
      @svm_train_path = (params[:svm_train_path] or "#{@bin_dir}/eluka-svm-train")
      @svm_scale_path = (params[:svm_scale_path] or "#{@bin_dir}/eluka-svm-scale")
      @svm_predict_path = (params[:svm_predict_path] or "#{@bin_dir}/eluka-svm-predict")
      @grid_py_path = (params[:grid_py_path] or "python rsvm/tools/grid.py")
      @fselect_py_path = (params[:fselect_py_path] or "python rsvm/tools/fselect.py")
      @verbose = (params[:verbose] or false)

      #Convert directory to absolute path
      Dir.chdir(@directory) do @directory = Dir.pwd end
    end

    # Add a data point to the training data
    #
    # data  - String or Hash accepted by Eluka::DataPoint
    # label - :positive or :negative; anything else raises RuntimeError
    def add (data, label)
      raise "No meaningful label associated with data" unless ([:positive, :negative].include? label)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_train.add(data_point.vector, @labels[label])
    end

    # Build a model from the training data using LibSVM
    #
    # Writes the training set to "<directory>/train", shells out to the
    # svm-train binary (model lands in "<directory>/model"), and resets the
    # test-vector accumulator. Returns the trainer's stdout.
    def build (features = nil)
      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM(features) end

      output = `#{@svm_train_path} #{@directory}/train #{@directory}/model`

      puts output if (@verbose)

      @fv_test = Eluka::FeatureVectors.new(@features, false)
      return output
    end

    # Classify a data point
    #
    # Returns :positive, :negative or :unknown (via the label bijection).
    # Raises unless #build has been called first.
    #
    # NOTE(review): every call appends the new point to @fv_test and
    # rewrites/re-predicts the whole "classify" file, yet only the leading
    # integer of "result" is read back — confirm this is intended.
    def classify (data, features = nil)
      raise "Untrained model" unless (@fv_test)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_test.add(data_point.vector)

      File.open(@directory + "/classify", "w") do |f| f.puts @fv_test.to_libSVM(features) end
      output = `#{@svm_predict_path} #{@directory}/classify #{@directory}/model #{@directory}/result`

      puts output if (@verbose)

      return @labels.lookup( File.open( @directory + "/result", "r" ).read.to_i )
    end

    # Suggests the best set of features chosen using fselect.py
    # IMPROVE: Depending on fselect.py (an unnecessary python dependency) is stupid
    # TODO: Finish writing fselect.rb and integrate it
    #
    # Runs fselect.py over the training file from the hard-coded
    # './rsvm/bin/tools' working directory, parses "train.select" (a
    # bracketed, comma-separated id list) back into feature names, and
    # cleans up fselect's temporary files. "field||term" text features are
    # returned as [field_symbol, term] pairs; numeric features as-is.
    def suggest_features
      sel_features = Array.new

      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM end

      Dir.chdir('./rsvm/bin/tools') do
        output = `python fselect.py #{@directory}/train`

        puts output if (@verbose)

        x = File.read("train.select")
        sel_f_ids = x[1..-2].split(", ")
        sel_f_ids.each do |f|
          s_f = @features.term(f.to_i)
          if s_f.instance_of? String then
            s_f = s_f.split("||")
            s_f[0] = s_f[0].to_sym
          end
          sel_features.push(s_f)
        end

        #Remove temporary files
        File.delete("train.select") if File.exist?("train.select")
        File.delete("train.fscore") if File.exist?("train.fscore")
        File.delete("train.tr.out") if File.exist?("train.tr.out")
      end

      return sel_features
    end
  end

end
|
data/lib/fselect.rb
ADDED
@@ -0,0 +1,321 @@
|
|
1
|
+
#trying to convert fselect.py method by method into ruby
require 'rbconfig'

##### Path Setting #####

# Bug fix: the top-level Config constant was deprecated and then removed
# (Ruby 3.2); RbConfig is the supported name.
# NOTE(review): host_os is rarely exactly 'win32' (usually mingw32/mswin) —
# confirm which platforms are meant to take the Windows branch.
is_win32 = (RbConfig::CONFIG["host_os"] == 'win32')
if is_win32
  gridpy_exe     = ".\\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe   = "..\\windows\\svmtrain.exe"
  svmpredict_exe = "..\\windows\\svmpredict.exe"
else
  gridpy_exe     = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe   = "../svm-train"
  svmpredict_exe = "../svm-predict"
end

##### Global Variables #####

# Training/testing file paths (set by arg_process)
@train_pathfile = ""
@train_file     = ""
@test_pathfile  = ""
@test_file      = ""
@if_predict_all = 0

@whole_fsc_dict = {}
@whole_imp_v    = []

# Verbosity thresholds used by writelog
VERBOSE_MAX       = 100
VERBOSE_ITER      = 3
VERBOSE_GRID_TIME = 2
VERBOSE_TIME      = 1
|
31
|
+
|
32
|
+
# Parses command-line arguments: training_file [testing_file].
# Sets @train_pathfile/@train_file (and the @test_* pair when given);
# raises when a named file does not exist.
#
# Bug fixes relative to the straight Python port:
#  * Ruby's ARGV excludes the program name, so the expected sizes are
#    1 or 2 (not 2 or 3) and argument indices shift down by one.
#  * The usage string was single-quoted, so #{} was never interpolated.
#  * The testing file was read from the same slot as the training file.
def arg_process
  unless (ARGV.size == 1 or ARGV.size == 2)
    puts "Usage: #{$0} training_file [testing_file]"
    exit
  end

  @train_pathfile = ARGV[0]
  raise "training file not found" unless File.exist? @train_pathfile
  @train_file = File.basename(@train_pathfile)

  if ARGV.size == 2
    @test_pathfile = ARGV[1]
    raise "testing file not found" unless File.exist? @test_pathfile
    @test_file = File.basename(@test_pathfile)
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
##### Decide sizes of selected features #####
|
51
|
+
|
52
|
+
# Returns the halving sequence max_index, max_index/2, ... (integer
# division) down to, but excluding, 1. Empty for max_index <= 1.
def feat_num_try_half(max_index)
  sizes = []
  n = max_index
  while n > 1
    sizes << n
    n /= 2
  end
  sizes
end
|
60
|
+
|
61
|
+
# Given (feature, score) pairs sorted by descending score, decides which
# feature-set sizes to try: halving sizes starting from the count of
# features whose score is at least 1e-20, capped at eight sizes.
#
# Bug fixes: the original depended on the `for`-loop leaking its variable
# (and mutating it just before `break`), and raised NoMethodError
# (nil + 1) when f_tuple was empty.
def feat_num_try(f_tuple)
  # Index of the last feature with a significant (>= 1e-20) score.
  last = f_tuple.size - 1
  f_tuple.each_index do |j|
    if f_tuple[j][1] < 1e-20
      last = j - 1
      break
    end
  end
  #only take first eight numbers (>1%)
  return feat_num_try_half(last + 1)[0...8]
end
|
71
|
+
|
72
|
+
# Shuffles label and sample in place with the SAME permutation (keeping
# label[i] paired with sample[i]), seeded deterministically for
# reproducible runs.
def random_shuffle(label, sample)
  srand 1
  size = label.size
  label.each_index do |i|
    j = rand(size)
    k = size - i - 1
    label[j], label[k] = label[k], label[j]
    sample[j], sample[k] = sample[k], sample[j]
  end
end
|
85
|
+
|
86
|
+
|
87
|
+
### compare function used in list.sort(): sort by element[1]
|
88
|
+
#def value_cmpf(x,y):
|
89
|
+
# if x[1]>y[1]: return -1
|
90
|
+
# if x[1]<y[1]: return 1
|
91
|
+
# return 0
|
92
|
+
|
93
|
+
# Sort key for (feature, score) pairs: the negated score, so an ascending
# sort yields descending scores.
def value_cmpf(x)
  -x[1]
end
|
96
|
+
|
97
|
+
### cal importance of features
|
98
|
+
### return fscore_dict and feat with desc order
|
99
|
+
### cal importance of features
### return fscore_dict and feat with desc order
#
# label/sample are forwarded to cal_Fscore (still to be ported from the
# Python original). Returns [score_dict, feat_v] where feat_v lists the
# feature ids sorted by descending F-score.
#
# Bug fix: the previous body was unconverted Python —
# list(score_dict.items()) and sort(key = value_cmpf) — and raised at
# runtime; replaced with to_a / sort_by.
def cal_feat_imp(label, sample)

  puts("calculating fsc...")

  score_dict = cal_Fscore(label, sample)

  score_tuples = score_dict.to_a.sort_by { |pair| value_cmpf(pair) }

  feat_v = score_tuples.map { |pair| pair[0] }

  puts("fsc done")
  return score_dict, feat_v
end
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
### select features and return new data
|
121
|
+
### select features and return new data
#
# sample - array of {feature_id => value} hashes
# feat_v - feature ids to keep (sorted in place, matching the Python
#          original's in-place list.sort())
#
# Returns a new array of hashes containing only the selected features
# that are present with a truthy value.
#
# Bug fix: the original block destructured each sample Hash into
# |key, s|, leaving s nil and crashing on s[f]; each element is a single
# Hash. Also feat_v.sort() discarded its result (Ruby sort is
# non-mutating) — sort! restores the intended in-place sort.
def select(sample, feat_v)
  new_samp = []

  feat_v.sort!

  #for each sample
  sample.each do |s|
    point = Hash.new
    #for each feature to select
    feat_v.each do |f|
      point[f] = s[f] if s[f]
    end
    new_samp.push(point)
  end

  return new_samp
end
|
139
|
+
|
140
|
+
|
141
|
+
=begin
|
142
|
+
#TODO: Convert the following code
|
143
|
+
|
144
|
+
### Do parameter searching (grid.py)
|
145
|
+
def train_svm(tr_file)
|
146
|
+
cmd = "#{gridpy_exe} #{tr_file}"
|
147
|
+
puts(cmd)
|
148
|
+
puts('Cross validation...')
|
149
|
+
std_out = Popen(cmd, shell = True, stdout = PIPE).stdout
|
150
|
+
|
151
|
+
line = ''
|
152
|
+
while 1:
|
153
|
+
last_line = line
|
154
|
+
line = std_out.readline()
|
155
|
+
if not line: break
|
156
|
+
c,g,rate = map(float,last_line.split())
|
157
|
+
|
158
|
+
print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
|
159
|
+
|
160
|
+
return c,g,rate
|
161
|
+
|
162
|
+
### Given (C,g) and training/testing data,
|
163
|
+
### return predicted labels
|
164
|
+
def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
|
165
|
+
global train_file
|
166
|
+
tr_file = train_file+".tr"
|
167
|
+
te_file = train_file+".te"
|
168
|
+
if model_name: model_file = model_name
|
169
|
+
else: model_file = "%s.model"%tr_file
|
170
|
+
out_file = "%s.o"%te_file
|
171
|
+
|
172
|
+
# train
|
173
|
+
writedata(tr_sample,tr_label,tr_file)
|
174
|
+
cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe,c,g,tr_file,model_file)
|
175
|
+
os.system(cmd)
|
176
|
+
|
177
|
+
# test
|
178
|
+
writedata(test_sample,test_label,te_file)
|
179
|
+
cmd = "%s %s %s %s" % (svmpredict_exe, te_file,model_file,out_file )
|
180
|
+
print(cmd)
|
181
|
+
os.system(cmd)
|
182
|
+
|
183
|
+
# fill in pred_y
|
184
|
+
pred_y=[]
|
185
|
+
fp = open(out_file)
|
186
|
+
line = fp.readline()
|
187
|
+
while line:
|
188
|
+
pred_y.append( float(line) )
|
189
|
+
line = fp.readline()
|
190
|
+
|
191
|
+
rem_file(tr_file)
|
192
|
+
#rem_file("%s.out"%tr_file)
|
193
|
+
#rem_file("%s.png"%tr_file)
|
194
|
+
rem_file(te_file)
|
195
|
+
if del_model: rem_file(model_file)
|
196
|
+
fp.close()
|
197
|
+
rem_file(out_file)
|
198
|
+
|
199
|
+
return pred_y
|
200
|
+
|
201
|
+
|
202
|
+
def cal_acc(pred_y, real_y):
|
203
|
+
right = 0.0
|
204
|
+
|
205
|
+
for i in range(len(pred_y)):
|
206
|
+
if(pred_y[i] == real_y[i]): right += 1
|
207
|
+
|
208
|
+
print("ACC: %d/%d"%(right, len(pred_y)))
|
209
|
+
return right/len(pred_y)
|
210
|
+
|
211
|
+
### balanced accuracy
|
212
|
+
def cal_bacc(pred_y, real_y):
|
213
|
+
p_right = 0.0
|
214
|
+
n_right = 0.0
|
215
|
+
p_num = 0
|
216
|
+
n_num = 0
|
217
|
+
|
218
|
+
size=len(pred_y)
|
219
|
+
for i in range(size):
|
220
|
+
if real_y[i] == 1:
|
221
|
+
p_num+=1
|
222
|
+
if real_y[i]==pred_y[i]: p_right+=1
|
223
|
+
else:
|
224
|
+
n_num+=1
|
225
|
+
if real_y[i]==pred_y[i]: n_right+=1
|
226
|
+
|
227
|
+
print([p_right,p_num,n_right,n_num])
|
228
|
+
writelog(" p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n"%(p_right,p_num,n_right,n_num))
|
229
|
+
if p_num==0: p_num=1
|
230
|
+
if n_num==0: n_num=1
|
231
|
+
return 0.5*( p_right/p_num + n_right/n_num )
|
232
|
+
=end
|
233
|
+
|
234
|
+
##### Log related #####
|
235
|
+
# Records the log file name in @logname and creates/truncates the file.
def initlog(name)
  @logname = name
  File.open(@logname, "w") { }
end
|
239
|
+
|
240
|
+
# Appends str to the log file when vlevel exceeds VERBOSE_ITER;
# lower-verbosity messages are silently dropped.
def writelog(str, vlevel = VERBOSE_MAX)
  return unless vlevel > VERBOSE_ITER
  File.open(@logname, "a") { |f| f.print(str) }
end
|
247
|
+
|
248
|
+
###### svm data IO ######
|
249
|
+
|
250
|
+
# Reads a LibSVM-format file.
#
# Returns [labels, samples, max_index]: parallel arrays of Float labels
# and {feature_id => value} hashes, plus the largest feature id seen.
# Lines starting with "#" and blank lines are skipped.
#
# Bug fixes relative to the original:
#  * the element loop iterated over a misspelled `elements` variable
#    (NameError; the array is `elems`)
#  * samples.push sat inside the per-element loop, appending the same
#    sample once per feature instead of once per line
def readdata(filename)
  labels = []
  samples = []
  max_index = 0

  File.open(filename) do |f|
    f.each_line do |line|
      line.chomp!
      next if line[0] == "#"

      elems = line.split(" ")
      next if elems.empty? # skip blank lines

      # First token is the label; the rest are feature:value pairs.
      labels.push(elems.first.to_f)

      sample = Hash.new
      elems.drop(1).each do |e|
        feature, value = e.split(":")
        p0 = feature.chomp.to_i
        p1 = value.chomp.to_f
        sample[p0] = p1

        max_index = p0 if p0 > max_index
      end
      samples.push(sample)
    end
  end

  return labels, samples, max_index
end
|
286
|
+
|
287
|
+
# Writes samples (array of {feature_id => value} hashes) in LibSVM format:
# one line per sample, "label id:value ...", feature ids ascending.
# Labels default to "0" when labels is nil. Writes to $stdout when
# filename is nil.
#
# Bug fixes relative to the original:
#  * printed the undefined `label[i]` instead of `labels[i]` (NameError)
#  * closed $stdout when no filename was given
#  * dropped the unused `num` local
def writedata(samples, labels, filename)
  fp = $stdout
  if filename
    fp = File.open(filename, "w")
  end

  samples.each_index do |i|
    if labels
      fp.print labels[i]
    else
      fp.print "0"
    end
    samples[i].keys.sort.each do |k|
      fp.print(" #{k}:#{samples[i][k]}")
    end
    fp.puts ""
  end
  fp.close if filename
end
|
307
|
+
|
308
|
+
###### PROGRAM ENTRY POINT ######

arg_process()

initlog("#{@train_file}.select")
writelog("start: #{Time.now}\n\n")
# NOTE(review): main() and predict_all() have not yet been ported from
# fselect.py, so the script cannot run end-to-end until they exist.
main()

# do testing on all possible feature sets
# Bug fix: the original tested the undefined local `if_predict_all`
# (NameError); the flag lives in @if_predict_all. The explicit != 0 is
# required because 0 is truthy in Ruby (matching the Python intent).
if @if_predict_all != 0
  predict_all()
end

writelog("\nend: \n#{Time.now}\n")