eluka 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. data/.document +5 -0
  2. data/DOCUMENTATION_STANDARDS +39 -0
  3. data/Gemfile +13 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +19 -0
  7. data/Rakefile +69 -0
  8. data/VERSION +1 -0
  9. data/examples/example.rb +59 -0
  10. data/ext/libsvm/COPYRIGHT +31 -0
  11. data/ext/libsvm/FAQ.html +1749 -0
  12. data/ext/libsvm/Makefile +25 -0
  13. data/ext/libsvm/Makefile.win +33 -0
  14. data/ext/libsvm/README +733 -0
  15. data/ext/libsvm/extconf.rb +1 -0
  16. data/ext/libsvm/heart_scale +270 -0
  17. data/ext/libsvm/java/Makefile +25 -0
  18. data/ext/libsvm/java/libsvm.jar +0 -0
  19. data/ext/libsvm/java/libsvm/svm.java +2776 -0
  20. data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
  21. data/ext/libsvm/java/libsvm/svm_model.java +21 -0
  22. data/ext/libsvm/java/libsvm/svm_node.java +6 -0
  23. data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
  24. data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
  25. data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
  26. data/ext/libsvm/java/svm_predict.java +163 -0
  27. data/ext/libsvm/java/svm_scale.java +350 -0
  28. data/ext/libsvm/java/svm_toy.java +471 -0
  29. data/ext/libsvm/java/svm_train.java +318 -0
  30. data/ext/libsvm/java/test_applet.html +1 -0
  31. data/ext/libsvm/python/Makefile +4 -0
  32. data/ext/libsvm/python/README +331 -0
  33. data/ext/libsvm/python/svm.py +259 -0
  34. data/ext/libsvm/python/svmutil.py +242 -0
  35. data/ext/libsvm/svm-predict.c +226 -0
  36. data/ext/libsvm/svm-scale.c +353 -0
  37. data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
  38. data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
  39. data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
  40. data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
  41. data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
  42. data/ext/libsvm/svm-toy/gtk/main.c +23 -0
  43. data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
  44. data/ext/libsvm/svm-toy/qt/Makefile +17 -0
  45. data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
  46. data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
  47. data/ext/libsvm/svm-train.c +376 -0
  48. data/ext/libsvm/svm.cpp +3060 -0
  49. data/ext/libsvm/svm.def +19 -0
  50. data/ext/libsvm/svm.h +105 -0
  51. data/ext/libsvm/svm.o +0 -0
  52. data/ext/libsvm/tools/README +149 -0
  53. data/ext/libsvm/tools/checkdata.py +108 -0
  54. data/ext/libsvm/tools/easy.py +79 -0
  55. data/ext/libsvm/tools/grid.py +359 -0
  56. data/ext/libsvm/tools/subset.py +146 -0
  57. data/ext/libsvm/windows/libsvm.dll +0 -0
  58. data/ext/libsvm/windows/svm-predict.exe +0 -0
  59. data/ext/libsvm/windows/svm-scale.exe +0 -0
  60. data/ext/libsvm/windows/svm-toy.exe +0 -0
  61. data/ext/libsvm/windows/svm-train.exe +0 -0
  62. data/lib/eluka.rb +10 -0
  63. data/lib/eluka/bijection.rb +23 -0
  64. data/lib/eluka/data_point.rb +36 -0
  65. data/lib/eluka/document.rb +47 -0
  66. data/lib/eluka/feature_vector.rb +86 -0
  67. data/lib/eluka/features.rb +31 -0
  68. data/lib/eluka/model.rb +129 -0
  69. data/lib/fselect.rb +321 -0
  70. data/lib/grid.rb +25 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_eluka.rb +7 -0
  73. metadata +214 -0
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/eluka.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'ferret'
3
+
4
+ require 'eluka/features'
5
+ require 'eluka/document'
6
+ require 'eluka/feature_vector'
7
+ require 'eluka/model'
8
+ require 'eluka/bijection'
9
+ require 'eluka/data_point'
10
+
#Creates a two way hash like lookup using two hashes
#Source inspired from a post in some forum
#Author: Unknown
module Eluka
  # Hash subclass that also maintains the inverse mapping, so the key for a
  # given value can be found in O(1) via #lookup. Assignments keep the
  # mapping one-to-one in both directions.
  class Bijection < Hash
    def initialize(*args)
      super(*args)
      # Inverse map (value => key); empty for a freshly built hash.
      @reverse = self.invert
    end

    # Stores key => val while keeping @reverse consistent.
    #
    # Fixes two defects of the original implementation:
    #  * reassigning an existing key left a stale @reverse entry for the
    #    key's previous value;
    #  * re-assigning the SAME key/value pair deleted the pair from the
    #    forward hash entirely (it evicted the "old owner" of val, which
    #    was the key itself).
    def []=(key, val)
      # Drop the reverse entry for this key's previous value, if any.
      if has_key?(key)
        @reverse.delete(self[key])
      end
      # Evict the other key that currently owns val (one-to-one mapping),
      # but never evict key itself on a repeated identical assignment.
      if @reverse.has_key?(val) && @reverse[val] != key
        self.delete(@reverse[val])
      end
      super(key, val)
      @reverse[val] = key
    end

    # Reverse lookup: returns the key mapped to val, or nil.
    def lookup(val)
      @reverse[val]
    end
  end
end
module Eluka
  # Wraps one raw data instance and converts it into a sparse feature
  # vector on demand.
  class DataPoint

    # data     - a String (shorthand for {:text => data}) or a Hash whose
    #            values are Strings or numbers
    # analyzer - Ferret analyzer used to tokenize String fields (unused
    #            when every field is numeric)
    #
    # Raises RuntimeError when data is nil, not a Hash/String, or empty.
    def initialize(data, analyzer)
      raise "Can't find any data" unless (data)

      data = {:text => data} if data.instance_of?(String)

      raise "Invalid data added" unless (data.instance_of? Hash)
      raise "Data can't be empty" unless (data.size > 0)

      @data = data
      @analyzer = analyzer
    end

    # Returns a Hash feature vector: String fields are tokenized through
    # Eluka::Document (keys become "field||term"), numeric fields are
    # copied through under their field name.
    def vector
      vector = Hash.new

      @data.each do |field, value|
        if value.instance_of?(String)
          doc_vec = Eluka::Document.new(field, value, @analyzer).vector
          vector.merge!(doc_vec)
        elsif value.is_a?(Numeric)
          # The original tested instance_of?(Fixnum) -- Fixnum was removed
          # in Ruby 3, which made this branch raise NameError. Numeric
          # covers Integer/Float (and is backward compatible).
          vector[field] = value
        else
          raise "A field can contain either an integer or a double or it can be a string"
        end
      end

      vector
    end

  end
end

module Eluka

  # Tokenizes a single text field with the supplied Ferret analyzer and
  # exposes it as an L2-normalized term-frequency vector.
  class Document
    def initialize(field, text, analyzer)
      @field = field
      @text = text
      @analyzer = analyzer
      @bag_of_words = nil
      self.bag_of_words
    end

    # Tokenizes @text and records, for every distinct token, the list of
    # positions at which it occurs; stores the result in @bag_of_words.
    # NOTE(review): the stream is opened with the literal symbol :field,
    # not @field -- presumably intentional, since only the text matters
    # for tokenization; confirm against the Ferret analyzer contract.
    def bag_of_words
      position = 0
      @bag_of_words = Hash.new

      stream = @analyzer.token_stream(:field, @text)
      while (token = stream.next)
        position += token.pos_inc
        (@bag_of_words[token.text] ||= Array.new).push(position)
      end
    end

    # Returns a Hash mapping "field||term" to the term's frequency divided
    # by the Euclidean norm of all term frequencies.
    def vector
      norm_sq = 0
      @bag_of_words.each_value do |positions|
        norm_sq += positions.size**2
      end
      norm = norm_sq.to_f**0.5

      result = Hash.new
      @bag_of_words.each do |term, positions|
        result[[@field, term].join("||")] = positions.size.to_f / norm
      end
      result
    end
  end

end
#!/usr/bin/env ruby
#
# Feature Vectors performs two important functions
#
# 1. Maintain a list of <feature, value> pairs for each data point (vector)
#    so that a model can be built whenever needed. (Sparse representation)
#
# 2. Adds every feature to the Features object to maintain
#    a unique list of features
#
# TODO: On disk representation for large training data
module Eluka

  class FeatureVectors

    # features - shared Features registry (term <-> id)
    # train    - true when these vectors are training data; new terms are
    #            then registered in the features list before serialization
    def initialize (features, train)
      @fvs = Array.new
      @features = features #Instance of features
      @train = train       #Boolean
    end

    # Stores the raw vector and its label; conversion to feature-vector
    # form is deferred until serialization.
    def add (vector, label = 0)
      @fvs.push([vector, label])
    end

    # Registers every term seen in the stored vectors with the shared
    # features list (training data only).
    def define_features
      @fvs.each do |vector, _label|
        vector.each_key do |term|
          @features.add(term)
        end
      end
    end

    # Serializes all stored points into LibSVM text format -- one data
    # point per line, "label id:value id:value ...".
    #
    # When sel_features is given, only those features are emitted.
    def to_libSVM (sel_features = nil)
      # Membership hash for the selected features (when any were given).
      selected = Hash.new
      sel_features.each { |f| selected[f] = 1 } if sel_features

      define_features if @train #Only training data defines features

      lines = @fvs.map do |vector, label|
        parts = [label]

        #OPTIMIZE: Change this line to consider sorting in case of terms being features
        (1..@features.f_count).each do |id|
          term = @features.term(id)
          value = vector[term]
          next unless value
          parts.push([id, value].join(":")) if selected[term] || sel_features.nil?
        end

        parts.join(" ")
      end
      lines.join("\n")
    end

  end

end

module Eluka

  # Registry that assigns a stable, 1-based integer id to every distinct
  # feature term, with lookup in both directions (term -> id, id -> term).
  class Features
    def initialize
      @features = Eluka::Bijection.new
      @f_count = 0
    end

    # Number of distinct terms registered so far.
    attr_reader :f_count

    # Returns the id of term, registering it first if it is unseen.
    def add (term)
      existing = @features[term]
      return existing if existing
      @f_count += 1
      @features[term] = @f_count
      @features[term]
    end

    # Id for a known term (nil when unregistered).
    def id (term)
      @features[term]
    end

    # Term for a known id (nil when out of range).
    def term (id)
      @features.lookup(id)
    end

  end

end


module Eluka

  #A binary classifier classifies data into two classes given a category (the class label)
  # a. Data which is indicative of the category -- positive data
  # b. Data which is not indicative of the category -- negative data
  #
  #== Model
  # A classifier model observes positive and negative data and learns the properties of
  #each set. In the future if given an unlabelled data point it decides whether the
  #data point is a positive or negative instance of the category.
  #
  #=== Internal Data Representation
  # A classifier model internally represents a data instance as a point in a vector space
  # The dimensions of the vector space are termed as features
  #
  #=== Eluka::Model
  # An Eluka model takes a hash of features and their values and internally processes them
  #as points in a vector space. If the input is a string of words like in a document then
  #it relies on Ferret's text analysis modules to convert it into a data point

  class Model

    include Ferret::Analysis

    # Initialize the classifier with sane defaults
    # if customised data is not provided
    #
    # Recognized params keys (all optional):
    #   :directory         -- working dir for train/model/classify/result files (default "/tmp")
    #   :svm_train_path    -- path to the LibSVM train binary
    #   :svm_scale_path    -- path to the LibSVM scale binary (stored, not used below)
    #   :svm_predict_path  -- path to the LibSVM predict binary
    #   :grid_py_path, :fselect_py_path -- external python tool commands
    #   :verbose           -- echo external tool output when truthy

    def initialize (params = {})
      #Set the labels
      # Bijection lets classify() map a numeric prediction back to its symbol.
      @labels = Bijection.new
      @labels[:positive] = 1
      @labels[:negative] = -1
      @labels[:unknown] = 0

      # NOTE(review): @bin_dir resolves to "<gem_root>/../bin" -- i.e. a
      # sibling of the directory containing this file's parent; confirm
      # this matches where the eluka-svm-* binaries are installed.
      @gem_root = File.expand_path(File.join(File.dirname(__FILE__), '..'))
      @bin_dir = File.expand_path(File.join(File.dirname(@gem_root), 'bin'))

      @analyzer = StandardAnalyzer.new
      @features = Eluka::Features.new
      @fv_train = Eluka::FeatureVectors.new(@features, true)
      @fv_test = nil

      @directory = (params[:directory] or "/tmp")
      @svm_train_path = (params[:svm_train_path] or "#{@bin_dir}/eluka-svm-train")
      @svm_scale_path = (params[:svm_scale_path] or "#{@bin_dir}/eluka-svm-scale")
      @svm_predict_path = (params[:svm_predict_path] or "#{@bin_dir}/eluka-svm-predict")
      @grid_py_path = (params[:grid_py_path] or "python rsvm/tools/grid.py")
      @fselect_py_path = (params[:fselect_py_path] or "python rsvm/tools/fselect.py")
      @verbose = (params[:verbose] or false)

      #Convert directory to absolute path
      Dir.chdir(@directory) do @directory = Dir.pwd end
    end

    # Add a data point to the training data
    #
    # data  -- String or Hash accepted by Eluka::DataPoint
    # label -- :positive or :negative; anything else raises RuntimeError

    def add (data, label)
      raise "No meaningful label associated with data" unless ([:positive, :negative].include? label)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_train.add(data_point.vector, @labels[label])
    end

    # Build a model from the training data using LibSVM
    #
    # Writes the training set to <directory>/train, shells out to the
    # svm-train binary (model lands in <directory>/model), and returns the
    # tool's raw stdout. Also creates @fv_test so classification vectors
    # reuse the feature ids fixed at training time.
    #
    # features -- optional list restricting which features are serialized.

    def build (features = nil)
      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM(features) end

      output = `#{@svm_train_path} #{@directory}/train #{@directory}/model`

      puts output if (@verbose)

      @fv_test = Eluka::FeatureVectors.new(@features, false)
      return output
    end

    # Classify a data point
    #
    # Returns :positive, :negative or :unknown. Raises unless build() ran.
    #
    # NOTE(review): @fv_test accumulates every point ever classified and
    # the whole accumulated set is re-written and re-predicted on each
    # call, yet only the FIRST line of the result file is read back
    # (String#to_i stops at the first newline) -- so every call after the
    # first appears to return the first point's prediction. Confirm intent.

    def classify (data, features = nil)
      raise "Untrained model" unless (@fv_test)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_test.add(data_point.vector)

      File.open(@directory + "/classify", "w") do |f| f.puts @fv_test.to_libSVM(features) end
      output = `#{@svm_predict_path} #{@directory}/classify #{@directory}/model #{@directory}/result`

      puts output if (@verbose)

      return @labels.lookup( File.open( @directory + "/result", "r" ).read.to_i )
    end

    # Suggests the best set of features chosen using fselect.py
    # IMPROVE: Depending on fselect.py (an unnecessary python dependency) is stupid
    # TODO: Finish writing fselect.rb and integrate it
    #
    # Returns an Array of selected features: numeric-field features as-is,
    # text features as [field_symbol, term] pairs (split on "||").
    # NOTE(review): this shells out to "python fselect.py" inside a
    # hard-coded './rsvm/bin/tools' directory and ignores @fselect_py_path;
    # it will fail unless run from a directory containing that tree.

    def suggest_features
      sel_features = Array.new

      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM end

      Dir.chdir('./rsvm/bin/tools') do
        output = `python fselect.py #{@directory}/train`

        puts output if (@verbose)

        # fselect.py writes the selected ids to train.select as a
        # bracketed, comma-separated list, e.g. "[3, 1, 7]".
        x = File.read("train.select")
        sel_f_ids = x[1..-2].split(", ")
        sel_f_ids.each do |f|
          s_f = @features.term(f.to_i)
          if s_f.instance_of? String then
            s_f = s_f.split("||")
            s_f[0] = s_f[0].to_sym
          end
          sel_features.push(s_f)
        end

        #Remove temporary files
        File.delete("train.select") if File.exist?("train.select")
        File.delete("train.fscore") if File.exist?("train.fscore")
        File.delete("train.tr.out") if File.exist?("train.tr.out")
      end

      return sel_features
    end
  end

end
data/lib/fselect.rb ADDED
#trying to convert fselect.py method by method into ruby
require 'rbconfig'

##### Path Setting #####
# The bare Config constant was deprecated for years and removed in Ruby
# 3.2 -- RbConfig is the supported name. The original also compared
# host_os with == 'win32', which never matches real values such as
# "mingw32" or "mswin64"; match the conventional Windows patterns instead.
if RbConfig::CONFIG["host_os"] =~ /mswin|mingw|win32/
  gridpy_exe = ".\\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe = "..\\windows\\svmtrain.exe"
  svmpredict_exe = "..\\windows\\svmpredict.exe"
else
  gridpy_exe = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe = "../svm-train"
  svmpredict_exe = "../svm-predict"
end

##### Global Variables #####

@train_pathfile = ""
@train_file = ""
@test_pathfile = ""
@test_file = ""
@if_predict_all = 0

@whole_fsc_dict = {}
@whole_imp_v = []

# Verbosity thresholds used by writelog.
VERBOSE_MAX = 100
VERBOSE_ITER = 3
VERBOSE_GRID_TIME = 2
VERBOSE_TIME = 1
32
+ def arg_process
33
+ unless (ARGV.size == 2 or ARGV.size == 3)
34
+ puts 'Usage: #{ARGV[0]} training_file [testing_file]'
35
+ exit
36
+ end
37
+
38
+ @train_pathfile = ARGV[1]
39
+ raise "training file not found" unless File.exist? @train_pathfile
40
+ @train_file = File.basename(@train_pathfile)
41
+
42
+ if ARGV.size == 3
43
+ @test_pathfile = ARGV[1]
44
+ raise "testing file not found" unless File.exist? @test_pathfile
45
+ @test_file = File.basename(@test_pathfile)
46
+ end
47
+ end
48
+
49
+
50
+ ##### Decide sizes of selected feautures #####
51
+
52
+ def feat_num_try_half(max_index)
53
+ v=[]
54
+ while max_index > 1 do
55
+ v.push(max_index)
56
+ max_index /= 2
57
+ end
58
+ return v
59
+ end
60
+
61
+ def feat_num_try(f_tuple)
62
+ for i in 0...f_tuple.size do
63
+ if f_tuple[i][1] < 1e-20
64
+ i = i - 1
65
+ break
66
+ end
67
+ end
68
+ #only take first eight numbers (>1%)
69
+ return feat_num_try_half(i+1)[0...8]
70
+ end
71
+
72
+ def random_shuffle(label, sample)
73
+ srand 1
74
+ size = label.size
75
+ for i in 0...label.size
76
+ ri = rand(size)
77
+ tmp = label[ri]
78
+ label[ri] = label[size-i-1]
79
+ label[size-i-1] = tmp
80
+ tmp = sample[ri]
81
+ sample[ri] = sample[size-i-1]
82
+ sample[size-i-1] = tmp
83
+ end
84
+ end
85
+
86
+
87
+ ### compare function used in list.sort(): sort by element[1]
88
+ #def value_cmpf(x,y):
89
+ # if x[1]>y[1]: return -1
90
+ # if x[1]<y[1]: return 1
91
+ # return 0
92
+
93
+ def value_cmpf(x)
94
+ return (-x[1])
95
+ end
96
+
97
+ ### cal importance of features
98
+ ### return fscore_dict and feat with desc order
99
+ def cal_feat_imp(label, sample)
100
+
101
+ puts("calculating fsc...")
102
+
103
+ score_dict = cal_Fscore(label, sample)
104
+
105
+ #NOTE: Convert the following two lines carefully
106
+ score_tuples = list(score_dict.items())
107
+ score_tuples.sort(key = value_cmpf)
108
+
109
+ feat_v = score_tuples
110
+ for i in 0...feat_v.size
111
+ feat_v[i] = score_tuples[i][0]
112
+ end
113
+
114
+ puts("fsc done")
115
+ return score_dict,feat_v
116
+ end
117
+
118
+
119
+
120
+ ### select features and return new data
121
+ def select(sample, feat_v)
122
+ new_samp = []
123
+
124
+ feat_v.sort()
125
+
126
+ #for each sample
127
+ sample.each do |key, s| #NOTE: Extremely doubtful conversion
128
+ point = Hash.new
129
+ #for each feature to select
130
+ feat_v.each do |f|
131
+ if s[f]
132
+ point[f]=s[f]
133
+ end
134
+ end
135
+ new_samp.push(point)
136
+ end
137
+ return new_samp
138
+ end
139
+
140
+
141
+ =begin
142
+ #TODO: Convert the following code
143
+
144
+ ### Do parameter searching (grid.py)
145
+ def train_svm(tr_file)
146
+ cmd = "#{gridpy_exe} #{tr_file}"
147
+ puts(cmd)
148
+ puts('Cross validation...')
149
+ std_out = Popen(cmd, shell = True, stdout = PIPE).stdout
150
+
151
+ line = ''
152
+ while 1:
153
+ last_line = line
154
+ line = std_out.readline()
155
+ if not line: break
156
+ c,g,rate = map(float,last_line.split())
157
+
158
+ print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
159
+
160
+ return c,g,rate
161
+
162
+ ### Given (C,g) and training/testing data,
163
+ ### return predicted labels
164
+ def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
165
+ global train_file
166
+ tr_file = train_file+".tr"
167
+ te_file = train_file+".te"
168
+ if model_name: model_file = model_name
169
+ else: model_file = "%s.model"%tr_file
170
+ out_file = "%s.o"%te_file
171
+
172
+ # train
173
+ writedata(tr_sample,tr_label,tr_file)
174
+ cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe,c,g,tr_file,model_file)
175
+ os.system(cmd)
176
+
177
+ # test
178
+ writedata(test_sample,test_label,te_file)
179
+ cmd = "%s %s %s %s" % (svmpredict_exe, te_file,model_file,out_file )
180
+ print(cmd)
181
+ os.system(cmd)
182
+
183
+ # fill in pred_y
184
+ pred_y=[]
185
+ fp = open(out_file)
186
+ line = fp.readline()
187
+ while line:
188
+ pred_y.append( float(line) )
189
+ line = fp.readline()
190
+
191
+ rem_file(tr_file)
192
+ #rem_file("%s.out"%tr_file)
193
+ #rem_file("%s.png"%tr_file)
194
+ rem_file(te_file)
195
+ if del_model: rem_file(model_file)
196
+ fp.close()
197
+ rem_file(out_file)
198
+
199
+ return pred_y
200
+
201
+
202
+ def cal_acc(pred_y, real_y):
203
+ right = 0.0
204
+
205
+ for i in range(len(pred_y)):
206
+ if(pred_y[i] == real_y[i]): right += 1
207
+
208
+ print("ACC: %d/%d"%(right, len(pred_y)))
209
+ return right/len(pred_y)
210
+
211
+ ### balanced accuracy
212
+ def cal_bacc(pred_y, real_y):
213
+ p_right = 0.0
214
+ n_right = 0.0
215
+ p_num = 0
216
+ n_num = 0
217
+
218
+ size=len(pred_y)
219
+ for i in range(size):
220
+ if real_y[i] == 1:
221
+ p_num+=1
222
+ if real_y[i]==pred_y[i]: p_right+=1
223
+ else:
224
+ n_num+=1
225
+ if real_y[i]==pred_y[i]: n_right+=1
226
+
227
+ print([p_right,p_num,n_right,n_num])
228
+ writelog(" p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n"%(p_right,p_num,n_right,n_num))
229
+ if p_num==0: p_num=1
230
+ if n_num==0: n_num=1
231
+ return 0.5*( p_right/p_num + n_right/n_num )
232
+ =end
233
+
234
+ ##### Log related #####
235
+ def initlog(name)
236
+ @logname = name
237
+ logfile = File.open(@logname, "w").close
238
+ end
239
+
240
+ def writelog(str, vlevel = VERBOSE_MAX)
241
+ if vlevel > VERBOSE_ITER
242
+ logfile = File.open(@logname, "a")
243
+ logfile.print(str)
244
+ logfile.close
245
+ end
246
+ end
247
+
248
+ ###### svm data IO ######
249
+
250
+ def readdata(filename)
251
+ labels = Array.new
252
+ samples = Array.new
253
+ max_index = 0
254
+
255
+ f = File.open(filename)
256
+
257
+ f.each_line do |line|
258
+ line.chomp!
259
+ next if line[0] == "#"
260
+
261
+ elems = line.split(" ")
262
+ sample = Hash.new
263
+ label_read = false
264
+ elements.each do |e|
265
+ unless label_read
266
+ labels.push e.to_f
267
+ label_read = true
268
+ next
269
+ end
270
+
271
+ feature, value = e.split(":")
272
+ p0 = feature.chomp.to_i
273
+ p1 = value.chomp.to_f
274
+ sample[p0] = p1
275
+
276
+ max_index = p0 if p0 > max_index
277
+
278
+ samples.push(sample)
279
+ end
280
+ end
281
+
282
+ f.close
283
+
284
+ return labels, samples, max_index
285
+ end
286
+
287
+ def writedata(samples, labels, filename)
288
+ fp = $stdout
289
+ if filename
290
+ fp = File.open(filename, "w")
291
+ end
292
+
293
+ num = samples.size
294
+ samples.each_index do |i|
295
+ if labels
296
+ fp.print label[i]
297
+ else
298
+ fp.print "0"
299
+ end
300
+ samples[i].keys.sort.each do |k|
301
+ fp.print(" #{k}:#{samples[i][k]}")
302
+ end
303
+ fp.puts ""
304
+ end
305
+ fp.close
306
+ end
307
+
308
+ ###### PROGRAM ENTRY POINT ######
309
+
310
+ arg_process()
311
+
312
+ initlog("#{@train_file}.select")
313
+ writelog("start: #{Time.now}\n\n")
314
+ main()
315
+
316
+ # do testing on all possible feature sets
317
+ if if_predict_all
318
+ predict_all()
319
+ end
320
+
321
+ writelog("\nend: \n#{Time.now}\n")