eluka 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. data/.document +5 -0
  2. data/DOCUMENTATION_STANDARDS +39 -0
  3. data/Gemfile +13 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +19 -0
  7. data/Rakefile +69 -0
  8. data/VERSION +1 -0
  9. data/examples/example.rb +59 -0
  10. data/ext/libsvm/COPYRIGHT +31 -0
  11. data/ext/libsvm/FAQ.html +1749 -0
  12. data/ext/libsvm/Makefile +25 -0
  13. data/ext/libsvm/Makefile.win +33 -0
  14. data/ext/libsvm/README +733 -0
  15. data/ext/libsvm/extconf.rb +1 -0
  16. data/ext/libsvm/heart_scale +270 -0
  17. data/ext/libsvm/java/Makefile +25 -0
  18. data/ext/libsvm/java/libsvm.jar +0 -0
  19. data/ext/libsvm/java/libsvm/svm.java +2776 -0
  20. data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
  21. data/ext/libsvm/java/libsvm/svm_model.java +21 -0
  22. data/ext/libsvm/java/libsvm/svm_node.java +6 -0
  23. data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
  24. data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
  25. data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
  26. data/ext/libsvm/java/svm_predict.java +163 -0
  27. data/ext/libsvm/java/svm_scale.java +350 -0
  28. data/ext/libsvm/java/svm_toy.java +471 -0
  29. data/ext/libsvm/java/svm_train.java +318 -0
  30. data/ext/libsvm/java/test_applet.html +1 -0
  31. data/ext/libsvm/python/Makefile +4 -0
  32. data/ext/libsvm/python/README +331 -0
  33. data/ext/libsvm/python/svm.py +259 -0
  34. data/ext/libsvm/python/svmutil.py +242 -0
  35. data/ext/libsvm/svm-predict.c +226 -0
  36. data/ext/libsvm/svm-scale.c +353 -0
  37. data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
  38. data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
  39. data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
  40. data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
  41. data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
  42. data/ext/libsvm/svm-toy/gtk/main.c +23 -0
  43. data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
  44. data/ext/libsvm/svm-toy/qt/Makefile +17 -0
  45. data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
  46. data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
  47. data/ext/libsvm/svm-train.c +376 -0
  48. data/ext/libsvm/svm.cpp +3060 -0
  49. data/ext/libsvm/svm.def +19 -0
  50. data/ext/libsvm/svm.h +105 -0
  51. data/ext/libsvm/svm.o +0 -0
  52. data/ext/libsvm/tools/README +149 -0
  53. data/ext/libsvm/tools/checkdata.py +108 -0
  54. data/ext/libsvm/tools/easy.py +79 -0
  55. data/ext/libsvm/tools/grid.py +359 -0
  56. data/ext/libsvm/tools/subset.py +146 -0
  57. data/ext/libsvm/windows/libsvm.dll +0 -0
  58. data/ext/libsvm/windows/svm-predict.exe +0 -0
  59. data/ext/libsvm/windows/svm-scale.exe +0 -0
  60. data/ext/libsvm/windows/svm-toy.exe +0 -0
  61. data/ext/libsvm/windows/svm-train.exe +0 -0
  62. data/lib/eluka.rb +10 -0
  63. data/lib/eluka/bijection.rb +23 -0
  64. data/lib/eluka/data_point.rb +36 -0
  65. data/lib/eluka/document.rb +47 -0
  66. data/lib/eluka/feature_vector.rb +86 -0
  67. data/lib/eluka/features.rb +31 -0
  68. data/lib/eluka/model.rb +129 -0
  69. data/lib/fselect.rb +321 -0
  70. data/lib/grid.rb +25 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_eluka.rb +7 -0
  73. metadata +214 -0
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/eluka.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'ferret'
3
+
4
+ require 'eluka/features'
5
+ require 'eluka/document'
6
+ require 'eluka/feature_vector'
7
+ require 'eluka/model'
8
+ require 'eluka/bijection'
9
+ require 'eluka/data_point'
10
+
#Creates a two way hash like lookup using two hashes
#Source inspired from a post in some forum
#Author: Unknown
module Eluka
  # A Hash that also maintains the inverse mapping (value => key) so keys
  # can be found from values in O(1) via #lookup. Assigning a value that is
  # already used by another key evicts that key, keeping the mapping
  # one-to-one in both directions.
  class Bijection < Hash
    def initialize(*args)
      super(*args)
      @reverse = self.invert
    end

    # Set key => val while keeping @reverse consistent.
    #
    # Fix: when an existing key was re-assigned a new value, the original
    # left the stale (old_value => key) entry in @reverse, so #lookup of
    # the old value still returned the key. The stale entry is now removed.
    def []=(key, val)
      # drop the reverse entry for this key's previous value, if any
      @reverse.delete(self[key]) if has_key?(key)
      # value already used by another key: evict that key to stay bijective
      if @reverse.has_key?(val)
        self.delete(@reverse[val])
      end
      super(key, val)
      @reverse[val] = key
    end

    # Reverse lookup: the key currently mapped to val (nil if none).
    def lookup(val)
      @reverse[val]
    end
  end
end
module Eluka
  # Wraps one raw data instance — a String (treated as {:text => string})
  # or a Hash of field => value — and converts it into a sparse feature
  # vector on demand.
  class DataPoint

    # data     - String or non-empty Hash of field => String/Numeric value
    # analyzer - Ferret analyzer used to tokenise String fields
    #            (unused when every field is numeric)
    #
    # Raises RuntimeError on nil, non-Hash, or empty input.
    def initialize(data, analyzer)
      raise "Can't find any data" unless (data)

      data = {:text => data} if data.instance_of?(String)

      raise "Invalid data added" unless (data.instance_of? Hash)
      raise "Data can't be empty" unless (data.size > 0)

      @data = data
      @analyzer = analyzer
    end

    # Builds the sparse vector: String fields are tokenised through
    # Eluka::Document (keys "field||term"), numeric fields pass through
    # under their own field key.
    def vector
      vector = Hash.new

      @data.each do |field, value|
        if value.instance_of?(String)
          doc_vec = Eluka::Document.new(field, value, @analyzer).vector
          vector.merge!(doc_vec)
        elsif value.is_a?(Numeric)
          # Fix: the original tested instance_of?(Fixnum) or Float; Fixnum
          # was removed in Ruby 3 (NameError) and big integers were
          # rejected. is_a?(Numeric) preserves and generalizes behaviour.
          vector[field] = value
        else
          raise "A field can contain either an integer or a double or it can be a string"
        end
      end

      vector
    end

  end
end
module Eluka

  # Tokenises one text field with a Ferret-style analyzer and exposes it
  # as an L2-normalised term-frequency vector keyed "field||term".
  class Document
    def initialize(field, text, analyzer)
      @field = field
      @text = text
      @analyzer = analyzer
      @bag_of_words = nil
      self.bag_of_words   # eagerly tokenise on construction
    end

    # Tokenises @text and records, per term, the list of token positions
    # at which it occurs.
    #
    # Fix: returns the term => positions Hash; the original's last
    # expression was the `while` loop, so it returned nil despite the name.
    def bag_of_words
      # running token position within the document
      pos = 0

      @bag_of_words = Hash.new

      # NOTE(review): the analyzer is handed the literal name :field, not
      # @field — presumably fine for field-agnostic analyzers, but verify
      # if per-field analysis is ever needed.
      token_stream = @analyzer.token_stream(:field, @text)
      while token = token_stream.next do
        pos += token.pos_inc

        (@bag_of_words[token.text] ||= Array.new).push(pos)
      end

      @bag_of_words
    end

    # Unit-length term-frequency vector: each term's occurrence count
    # divided by the Euclidean norm of all counts.
    def vector
      vector = Hash.new
      squared_length = 0
      @bag_of_words.each do |term, pos_vector|
        squared_length += pos_vector.size**2
      end

      length = squared_length.to_f**0.5
      @bag_of_words.each do |term, pos_vector|
        vector[[@field, term].join("||")] = pos_vector.size.to_f / length
      end

      vector
    end
  end

end
# Feature Vectors performs two important functions
#
# 1. Maintains a list of <feature, value> pairs for each data point
#    (sparse representation) so a model can be built whenever needed.
#
# 2. Adds every feature to the Features object so a unique, numbered
#    list of features is maintained.
#
# TODO: On disk representation for large training data
module Eluka

  class FeatureVectors

    # features - the global feature registry (assigns integer ids)
    # train    - Boolean; training vectors register new features,
    #            classification vectors must not
    def initialize (features, train)
      @fvs = Array.new
      @features = features
      @train = train
    end

    # Store the raw (vector, label) pair; conversion to feature vectors
    # happens lazily in #to_libSVM.
    def add (vector, label = 0)
      @fvs.push([vector, label])
    end

    # Register every term seen in the training vectors with the global
    # feature list.
    def define_features
      @fvs.each do |vector, _label|
        vector.each_key { |term| @features.add(term) }
      end
    end

    # Render all stored points in LibSVM format: one data point per line,
    # "label id:value id:value ...". When sel_features is given, only
    # those features are emitted.
    def to_libSVM (sel_features = nil)
      # selected-feature membership table
      selected = Hash.new
      sel_features.each { |f| selected[f] = 1 } if sel_features

      # training data must register its features first
      define_features if @train

      lines = @fvs.map do |vector, label|
        parts = [label]
        #OPTIMIZE: Change this loop to consider sorting in case of terms being features
        (1..@features.f_count).each do |id|
          term = @features.term(id)
          value = vector[term]
          next unless value
          parts.push("#{id}:#{value}") if selected[term] or not sel_features
        end
        parts.join(" ")
      end
      lines.join("\n")
    end

  end

end
module Eluka

  # Registry that assigns stable, 1-based integer ids to feature terms
  # and supports lookup in both directions (term -> id, id -> term).
  class Features
    def initialize
      @features = Eluka::Bijection.new
      @f_count = 0
    end

    # number of distinct features registered so far
    attr_reader :f_count

    # Register term (assigning the next id on first sight) and return
    # its id.
    def add (term)
      if @features[term].nil?
        @f_count += 1
        @features[term] = @f_count
      end
      @features[term]
    end

    # id assigned to term (nil if unknown)
    def id (term)
      @features[term]
    end

    # term registered under id (nil if unknown)
    def term (id)
      @features.lookup(id)
    end

  end

end
module Eluka

  # A binary classifier classifies data into two classes given a category
  # (the class label):
  #   a. data which is indicative of the category     -- positive data
  #   b. data which is not indicative of the category -- negative data
  #
  #== Model
  # A classifier model observes positive and negative data and learns the
  # properties of each set. Given an unlabelled data point it then decides
  # whether the point is a positive or negative instance of the category.
  #
  #=== Internal Data Representation
  # A data instance is represented as a point in a vector space; the
  # dimensions of that space are termed features.
  #
  #=== Eluka::Model
  # Takes a hash of features and values and processes them as points in a
  # vector space. If the input is a string of words it relies on Ferret's
  # text analysis modules to convert it into a data point.

  class Model

    include Ferret::Analysis

    # Initialize the classifier with sane defaults when customised
    # settings are not provided.
    #
    # Recognised params (all optional): :directory, :svm_train_path,
    # :svm_scale_path, :svm_predict_path, :grid_py_path, :fselect_py_path,
    # :verbose
    def initialize (params = {})
      # label symbols <-> LibSVM integer classes
      @labels = Bijection.new
      @labels[:positive] = 1
      @labels[:negative] = -1
      @labels[:unknown]  = 0

      @gem_root = File.expand_path(File.join(File.dirname(__FILE__), '..'))
      @bin_dir  = File.expand_path(File.join(File.dirname(@gem_root), 'bin'))

      @analyzer = StandardAnalyzer.new
      @features = Eluka::Features.new
      @fv_train = Eluka::FeatureVectors.new(@features, true)
      @fv_test  = nil   # set by #build; doubles as the "model built" flag

      @directory        = params[:directory]        || "/tmp"
      @svm_train_path   = params[:svm_train_path]   || "#{@bin_dir}/eluka-svm-train"
      @svm_scale_path   = params[:svm_scale_path]   || "#{@bin_dir}/eluka-svm-scale"
      @svm_predict_path = params[:svm_predict_path] || "#{@bin_dir}/eluka-svm-predict"
      @grid_py_path     = params[:grid_py_path]     || "python rsvm/tools/grid.py"
      @fselect_py_path  = params[:fselect_py_path]  || "python rsvm/tools/fselect.py"
      @verbose          = params[:verbose]          || false

      #Convert directory to absolute path
      Dir.chdir(@directory) do @directory = Dir.pwd end
    end

    # Add a data point to the training data.
    # label must be :positive or :negative.
    def add (data, label)
      raise "No meaningful label associated with data" unless ([:positive, :negative].include? label)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_train.add(data_point.vector, @labels[label])
    end

    # Build a model from the training data using LibSVM.
    # Returns svm-train's console output.
    def build (features = nil)
      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM(features) end

      output = `#{@svm_train_path} #{@directory}/train #{@directory}/model`

      puts output if (@verbose)

      @fv_test = Eluka::FeatureVectors.new(@features, false)
      return output
    end

    # Classify a data point; returns :positive, :negative or :unknown.
    #
    # Fix: the original accumulated every classified point in @fv_test and
    # then read only the FIRST line of the result file, so from the second
    # call onwards it returned the first point's label. Each call now uses
    # a fresh feature-vector set holding only the current point.
    def classify (data, features = nil)
      raise "Untrained model" unless (@fv_test)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      fv = Eluka::FeatureVectors.new(@features, false)
      fv.add(data_point.vector)

      File.open(@directory + "/classify", "w") do |f| f.puts fv.to_libSVM(features) end
      output = `#{@svm_predict_path} #{@directory}/classify #{@directory}/model #{@directory}/result`

      puts output if (@verbose)

      # File.read avoids leaking the handle the original left open
      return @labels.lookup( File.read(@directory + "/result").to_i )
    end

    # Suggests the best set of features chosen using fselect.py
    # IMPROVE: Depending on fselect.py (an unnecessary python dependency) is stupid
    # TODO: Finish writing fselect.rb and integrate it
    def suggest_features
      sel_features = Array.new

      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM end

      Dir.chdir('./rsvm/bin/tools') do
        output = `python fselect.py #{@directory}/train`

        puts output if (@verbose)

        # fselect.py writes the chosen feature ids as e.g. "[1, 5, 9]"
        x = File.read("train.select")
        sel_f_ids = x[1..-2].split(", ")
        sel_f_ids.each do |f|
          s_f = @features.term(f.to_i)
          if s_f.instance_of? String then
            # text features are stored as "field||term"
            s_f = s_f.split("||")
            s_f[0] = s_f[0].to_sym
          end
          sel_features.push(s_f)
        end

        #Remove temporary files
        File.delete("train.select") if File.exist?("train.select")
        File.delete("train.fscore") if File.exist?("train.fscore")
        File.delete("train.tr.out") if File.exist?("train.tr.out")
      end

      return sel_features
    end
  end

end
data/lib/fselect.rb ADDED
#trying to convert fselect.py method by method into ruby
require 'rbconfig'

##### Path Setting #####

# Fix: `Config::CONFIG` was deprecated and removed in Ruby 2.2;
# RbConfig::CONFIG is the supported spelling.
# NOTE(review): host_os is rarely the literal 'win32' on modern builds
# (usually 'mingw32'/'mswin64') — comparison kept as-is to preserve the
# original behaviour; verify if Windows support is actually needed.
is_win32 = (RbConfig::CONFIG["host_os"] == 'win32')
if is_win32
  gridpy_exe = ".\\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe = "..\\windows\\svmtrain.exe"
  svmpredict_exe = "..\\windows\\svmpredict.exe"
else
  gridpy_exe = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe = "../svm-train"
  svmpredict_exe = "../svm-predict"
end

##### Global Variables #####

@train_pathfile = ""
@train_file = ""
@test_pathfile = ""
@test_file = ""
@if_predict_all = 0   # non-zero => run predict_all after main

@whole_fsc_dict = {}
@whole_imp_v = []

# verbosity thresholds consumed by writelog
VERBOSE_MAX = 100
VERBOSE_ITER = 3
VERBOSE_GRID_TIME = 2
VERBOSE_TIME = 1
32
# Parse ARGV: <training_file> [testing_file].
#
# Fixes relative to the Python port:
# * the usage string was single-quoted, so #{...} never interpolated;
# * Ruby's ARGV (unlike Python's sys.argv) does NOT include the program
#   name, so the training file is ARGV[0] and the optional testing file
#   is ARGV[1] (the original read ARGV[1] for BOTH and tested for 2 or 3
#   arguments, which could never work).
def arg_process
  unless (ARGV.size == 1 or ARGV.size == 2)
    puts "Usage: #{$0} training_file [testing_file]"
    exit
  end

  @train_pathfile = ARGV[0]
  raise "training file not found" unless File.exist? @train_pathfile
  @train_file = File.basename(@train_pathfile)

  if ARGV.size == 2
    @test_pathfile = ARGV[1]
    raise "testing file not found" unless File.exist? @test_pathfile
    @test_file = File.basename(@test_pathfile)
  end
end
48
+
49
+
50
##### Decide sizes of selected features #####

# Candidate feature-set sizes: max_index, then repeatedly halved
# (integer division) down to, but excluding, 1.
def feat_num_try_half(max_index)
  sizes = []
  n = max_index
  while n > 1
    sizes << n
    n /= 2
  end
  sizes
end
60
+
61
# Given (feature, score) tuples sorted by descending score, decide how
# many features to try: count the features before the first ~zero score,
# then take the halving sequence of that count.
#
# Fix: the original used a Python-style `for` loop and read the leaked
# loop variable after the loop, which raised NameError on an empty
# f_tuple. Array#index expresses the same cutoff directly.
def feat_num_try(f_tuple)
  # index of the first effectively-zero score; nil when all are significant
  cutoff = f_tuple.index { |t| t[1] < 1e-20 }
  count = cutoff || f_tuple.size
  #only take first eight numbers (>1%)
  feat_num_try_half(count)[0...8]
end
71
+
72
# Shuffle label and sample in lock-step with a FIXED seed (srand 1), so
# each call produces the same permutation while keeping every
# label/sample pair aligned at the same index.
def random_shuffle(label, sample)
  srand 1
  size = label.size
  size.times do |i|
    ri = rand(size)
    j = size - i - 1
    label[ri], label[j] = label[j], label[ri]
    sample[ri], sample[j] = sample[j], sample[ri]
  end
end
85
+
86
+
87
### sort key for (feature, score) tuples: negating the score makes an
### ascending sort order the tuples by DESCENDING score
### (replaces Python's value_cmpf comparator from fselect.py)
def value_cmpf(x)
  -x[1]
end
96
+
97
### cal importance of features
### return fscore_dict and feat with desc order
#
# Fix: the body still contained literal Python
# (`list(score_dict.items())`, `score_tuples.sort(key = value_cmpf)`),
# which is not Ruby and raised NoMethodError/NameError. Translated to
# Hash#to_a + sort_by with the same descending-score ordering.
#
# NOTE(review): cal_Fscore is not defined anywhere in this port yet (it
# exists in the original fselect.py) — this method cannot run until it
# is translated. TODO.
def cal_feat_imp(label, sample)
  puts("calculating fsc...")

  # feature id => F-score
  score_dict = cal_Fscore(label, sample)

  # (feature, score) pairs sorted by descending score
  score_tuples = score_dict.to_a.sort_by { |pair| value_cmpf(pair) }

  # keep only the feature ids, preserving the descending-score order
  feat_v = score_tuples.map { |pair| pair[0] }

  puts("fsc done")
  return score_dict, feat_v
end
117
+
118
+
119
+
120
### select features and return new data
#
# sample - Array of Hashes (feature id => value), one per data point
# feat_v - Array of feature ids to keep
#
# Returns a new Array of Hashes containing only the selected features.
#
# Fixes: the original iterated `sample.each do |key, s|`, a bad
# Python-to-Ruby conversion that left `s` nil for Hash elements and
# crashed with NoMethodError — each element IS the sample hash. Also,
# Ruby's Array#sort is non-destructive (unlike Python's list.sort), so
# the original's `feat_v.sort()` result was silently discarded; we sort
# into a copy without mutating the caller's array.
def select(sample, feat_v)
  new_samp = []

  ordered_feats = feat_v.sort

  #for each sample
  sample.each do |s|
    point = Hash.new
    #for each feature to select
    ordered_feats.each do |f|
      point[f] = s[f] if s[f]
    end
    new_samp.push(point)
  end
  return new_samp
end
139
+
140
+
141
+ =begin
142
+ #TODO: Convert the following code
143
+
144
+ ### Do parameter searching (grid.py)
145
+ def train_svm(tr_file)
146
+ cmd = "#{gridpy_exe} #{tr_file}"
147
+ puts(cmd)
148
+ puts('Cross validation...')
149
+ std_out = Popen(cmd, shell = True, stdout = PIPE).stdout
150
+
151
+ line = ''
152
+ while 1:
153
+ last_line = line
154
+ line = std_out.readline()
155
+ if not line: break
156
+ c,g,rate = map(float,last_line.split())
157
+
158
+ print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
159
+
160
+ return c,g,rate
161
+
162
+ ### Given (C,g) and training/testing data,
163
+ ### return predicted labels
164
+ def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
165
+ global train_file
166
+ tr_file = train_file+".tr"
167
+ te_file = train_file+".te"
168
+ if model_name: model_file = model_name
169
+ else: model_file = "%s.model"%tr_file
170
+ out_file = "%s.o"%te_file
171
+
172
+ # train
173
+ writedata(tr_sample,tr_label,tr_file)
174
+ cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe,c,g,tr_file,model_file)
175
+ os.system(cmd)
176
+
177
+ # test
178
+ writedata(test_sample,test_label,te_file)
179
+ cmd = "%s %s %s %s" % (svmpredict_exe, te_file,model_file,out_file )
180
+ print(cmd)
181
+ os.system(cmd)
182
+
183
+ # fill in pred_y
184
+ pred_y=[]
185
+ fp = open(out_file)
186
+ line = fp.readline()
187
+ while line:
188
+ pred_y.append( float(line) )
189
+ line = fp.readline()
190
+
191
+ rem_file(tr_file)
192
+ #rem_file("%s.out"%tr_file)
193
+ #rem_file("%s.png"%tr_file)
194
+ rem_file(te_file)
195
+ if del_model: rem_file(model_file)
196
+ fp.close()
197
+ rem_file(out_file)
198
+
199
+ return pred_y
200
+
201
+
202
+ def cal_acc(pred_y, real_y):
203
+ right = 0.0
204
+
205
+ for i in range(len(pred_y)):
206
+ if(pred_y[i] == real_y[i]): right += 1
207
+
208
+ print("ACC: %d/%d"%(right, len(pred_y)))
209
+ return right/len(pred_y)
210
+
211
+ ### balanced accuracy
212
+ def cal_bacc(pred_y, real_y):
213
+ p_right = 0.0
214
+ n_right = 0.0
215
+ p_num = 0
216
+ n_num = 0
217
+
218
+ size=len(pred_y)
219
+ for i in range(size):
220
+ if real_y[i] == 1:
221
+ p_num+=1
222
+ if real_y[i]==pred_y[i]: p_right+=1
223
+ else:
224
+ n_num+=1
225
+ if real_y[i]==pred_y[i]: n_right+=1
226
+
227
+ print([p_right,p_num,n_right,n_num])
228
+ writelog(" p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n"%(p_right,p_num,n_right,n_num))
229
+ if p_num==0: p_num=1
230
+ if n_num==0: n_num=1
231
+ return 0.5*( p_right/p_num + n_right/n_num )
232
+ =end
233
+
234
##### Log related #####

# Remember the log path in @logname and create/truncate the file.
def initlog(name)
  @logname = name
  File.open(@logname, "w") {}   # open-for-write + close == truncate
end
239
+
240
# Append str to the log file at @logname, but only when vlevel exceeds
# VERBOSE_ITER; lower-verbosity messages are dropped silently.
def writelog(str, vlevel = VERBOSE_MAX)
  return unless vlevel > VERBOSE_ITER
  File.open(@logname, "a") { |log| log.print(str) }
end
247
+
248
###### svm data IO ######

# Read a LibSVM-format file: "label id:value id:value ..." per line,
# '#' lines are comments.
#
# Returns [labels, samples, max_index] where labels is an Array of
# Floats, samples an Array of Hashes (feature id => value) aligned with
# labels, and max_index the largest feature id seen.
#
# Fixes: the inner loop iterated `elements` while the variable was named
# `elems` (NameError), and `samples.push(sample)` sat INSIDE that loop,
# so each line would have been pushed once per feature token. The file
# handle is also closed via the block form now, even on error.
def readdata(filename)
  labels = Array.new
  samples = Array.new
  max_index = 0

  File.open(filename) do |f|
    f.each_line do |line|
      line.chomp!
      # skip comments and blank lines (a blank line has no label and
      # would desynchronise labels from samples)
      next if line.empty? || line[0] == "#"

      elems = line.split(" ")
      sample = Hash.new
      label_read = false
      elems.each do |e|
        unless label_read
          # first token on the line is the class label
          labels.push e.to_f
          label_read = true
          next
        end

        feature, value = e.split(":")
        p0 = feature.chomp.to_i
        p1 = value.chomp.to_f
        sample[p0] = p1

        max_index = p0 if p0 > max_index
      end
      samples.push(sample)   # exactly one sample per data line
    end
  end

  return labels, samples, max_index
end
286
+
287
# Write samples (Array of feature-id => value Hashes) in LibSVM format,
# one line per sample: "label id:value ..." with ids in ascending order.
# When labels is nil every line is labelled "0"; when filename is nil
# the data goes to $stdout.
#
# Fixes: `fp.print label[i]` referenced an undefined `label` (the
# parameter is `labels`); the unconditional `fp.close` closed $stdout
# when no filename was given; unused local `num` removed.
def writedata(samples, labels, filename)
  fp = filename ? File.open(filename, "w") : $stdout
  begin
    samples.each_index do |i|
      fp.print(labels ? labels[i] : "0")
      samples[i].keys.sort.each do |k|
        fp.print(" #{k}:#{samples[i][k]}")
      end
      fp.puts ""
    end
  ensure
    fp.close if filename   # never close $stdout
  end
end
307
+
308
###### PROGRAM ENTRY POINT ######

arg_process()

initlog("#{@train_file}.select")
writelog("start: #{Time.now}\n\n")
main()   # TODO(review): main() has not been ported from fselect.py yet

# do testing on all possible feature sets
# Fix: the original tested the bare local `if_predict_all`, which is never
# defined (NameError) — the flag lives in @if_predict_all. And since 0 is
# truthy in Ruby (unlike Python), the integer flag must be compared to 0.
if @if_predict_all != 0
  predict_all()   # TODO(review): predict_all() not yet ported either
end

writelog("\nend: \n#{Time.now}\n")