crosslanguagespotter 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/Gemfile +3 -0
- data/Rakefile +13 -0
- data/crosslanguagespotter.gemspec +36 -0
- data/examples/ex1.rb +13 -0
- data/examples/services_example.rb +13 -0
- data/lib/crosslanguagespotter/basic.rb +157 -0
- data/lib/crosslanguagespotter/context.rb +139 -0
- data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
- data/lib/crosslanguagespotter/jaccard.rb +114 -0
- data/lib/crosslanguagespotter/methods/context.rb +127 -0
- data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
- data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
- data/lib/crosslanguagespotter/model_loading.rb +333 -0
- data/lib/crosslanguagespotter/oracle.rb +261 -0
- data/lib/crosslanguagespotter/report.rb +88 -0
- data/lib/crosslanguagespotter/version.rb +5 -0
- data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
- data/lib/crosslanguagespotter.rb +7 -0
- data/lib/jars/weka.jar +0 -0
- data/resources/css/bootstrap-theme.css +346 -0
- data/resources/css/bootstrap-theme.min.css +7 -0
- data/resources/css/bootstrap.css +5780 -0
- data/resources/css/bootstrap.min.css +7 -0
- data/resources/css/highlightstyles/arta.css +160 -0
- data/resources/css/highlightstyles/ascetic.css +50 -0
- data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
- data/resources/css/highlightstyles/brown_paper.css +105 -0
- data/resources/css/highlightstyles/brown_papersq.png +0 -0
- data/resources/css/highlightstyles/dark.css +105 -0
- data/resources/css/highlightstyles/default.css +153 -0
- data/resources/css/highlightstyles/docco.css +132 -0
- data/resources/css/highlightstyles/far.css +113 -0
- data/resources/css/highlightstyles/foundation.css +133 -0
- data/resources/css/highlightstyles/github.css +125 -0
- data/resources/css/highlightstyles/googlecode.css +147 -0
- data/resources/css/highlightstyles/idea.css +122 -0
- data/resources/css/highlightstyles/ir_black.css +105 -0
- data/resources/css/highlightstyles/magula.css +123 -0
- data/resources/css/highlightstyles/mono-blue.css +62 -0
- data/resources/css/highlightstyles/monokai.css +127 -0
- data/resources/css/highlightstyles/monokai_sublime.css +149 -0
- data/resources/css/highlightstyles/obsidian.css +154 -0
- data/resources/css/highlightstyles/paraiso.dark.css +93 -0
- data/resources/css/highlightstyles/paraiso.light.css +93 -0
- data/resources/css/highlightstyles/pojoaque.css +106 -0
- data/resources/css/highlightstyles/pojoaque.jpg +0 -0
- data/resources/css/highlightstyles/railscasts.css +182 -0
- data/resources/css/highlightstyles/rainbow.css +112 -0
- data/resources/css/highlightstyles/school_book.css +113 -0
- data/resources/css/highlightstyles/school_book.png +0 -0
- data/resources/css/highlightstyles/solarized_dark.css +107 -0
- data/resources/css/highlightstyles/solarized_light.css +107 -0
- data/resources/css/highlightstyles/sunburst.css +160 -0
- data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
- data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night.css +93 -0
- data/resources/css/highlightstyles/tomorrow.css +90 -0
- data/resources/css/highlightstyles/vs.css +89 -0
- data/resources/css/highlightstyles/xcode.css +158 -0
- data/resources/css/highlightstyles/zenburn.css +117 -0
- data/resources/example.html +1501 -0
- data/resources/js/bootstrap.js +1943 -0
- data/resources/js/bootstrap.min.js +7 -0
- data/resources/js/highlight.pack.js +1 -0
- data/resources/services_example.html +141 -0
- data/resources/template.html +61 -0
- data/test/data/angular-puzzle.GS +111 -0
- data/test/data/angular_puzzle/app.js +66 -0
- data/test/data/angular_puzzle/index.html +67 -0
- data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
- data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
- data/test/data/example.html +5 -0
- data/test/data/example.js +4 -0
- data/test/data/services/index.html +33 -0
- data/test/data/services/script.js +15 -0
- data/test/test_helper.rb +9 -0
- data/test/test_parsing.rb +23 -0
- data/test/test_spotter.rb +42 -0
- data/test/test_wekaintegration.rb +43 -0
- metadata +328 -0
@@ -0,0 +1,261 @@
|
|
1
|
+
require 'codemodels'
|
2
|
+
require 'codemodels/js'
|
3
|
+
require 'codemodels/html'
|
4
|
+
require 'csv'
|
5
|
+
require 'crosslanguagespotter/model_loading'
|
6
|
+
#require 'console'
|
7
|
+
#require 'code_processing'
|
8
|
+
|
9
|
+
include CodeModels
|
10
|
+
|
11
|
+
module CrossLanguageSpotter
|
12
|
+
|
13
|
+
# One end of a relation: a file, a line, a column and a surface form
# (presumably the text found at that location — confirm against the oracle format).
OracleRelationEnd = Struct.new :file, :line, :col, :surface_form
|
14
|
+
# One end of a relation identified by a file and an index
# (presumably the traversal index of the node inside that file's model — confirm).
MetaOracleRelationEnd = Struct.new :file, :index
|
15
|
+
|
16
|
+
class OracleLoader
|
17
|
+
|
18
|
+
# Trains a Weka classifier from a project and its oracle file.
#
# srcpath::    root directory of the project sources (forwarded to to_train_data)
# oraclepath:: path of the oracle file (forwarded to to_train_data)
#
# Returns a WekaClassifier built from the instances derived from the
# training rows; the :result attribute is used as the class attribute.
def build_weka_classifier(srcpath,oraclepath)
  # to_train_data maps each relation to its feature row: only the rows are needed
  data = to_train_data(srcpath,oraclepath).values

  # attribute name => attribute type, as expected by hash2weka_instances
  keys = {
    shared_length: :numeric,
    tfidf_shared: :numeric,
    itfidf_shared: :numeric,
    perc_shared_length_min: :numeric,
    perc_shared_length_max: :numeric,
    diff_min: :numeric,
    diff_max: :numeric,
    perc_diff_min: :numeric,
    perc_diff_max: :numeric,
    context: :numeric,
    jaccard: :numeric,
    jaro: :numeric,
    tversky: :numeric,
    result: :boolean
  }
  train_instances = hash2weka_instances("oracle",data,keys,:result)
  WekaClassifier.new(train_instances)
end
|
43
|
+
|
44
|
+
# Builds the labelled training data used to train the classifier.
#
# The project found at +srcpath+ is loaded and its features are computed
# through Spotter#features_for_project. Then the oracle file is read: every
# line which is neither blank nor a comment (starting with '#') must contain
# exactly eight colon-separated values,
#
#   file_a:line_a:col_a:surface_form_a:file_b:line_b:col_b:surface_form_b
#
# where the files are relative to +srcpath+ and the columns count 4 for each
# tab (they are converted by convert_from_tabcolumn_to_plaincolumn).
# Each oracle line whose two ends can be located in the models becomes a
# positive example; every other pair produced by
# Project#iter_over_shared_ids_instances becomes a negative example.
#
# srcpath::    root directory of the project
# oraclepath:: path of the oracle file
#
# Returns a Hash mapping each CrossLanguageRelation to its row: the features
# of the relation plus a :result entry (true for positive examples).
#
# Raises a RuntimeError when an oracle line is malformed, when a file named by
# the oracle has no model, or when a positive relation has no features.
# When a node cannot be located a message is printed and the line is skipped.
def to_train_data(srcpath,oraclepath)
  project = Project.new(srcpath)
  features = Spotter.new.features_for_project(project)

  # lazy cache of the file contents, read by convert_from_tabcolumn_to_plaincolumn
  @file_lines = Hash.new do |h,k|
    h[k] = File.readlines(k)
  end

  train_data = {}

  # block form, so that the oracle file is closed also when a line is malformed
  File.open(oraclepath,'r') do |oracle|
    oracle.each_with_index do |input_line,l|
      input_line = input_line.strip
      next if input_line.empty? || input_line.start_with?('#')

      values = input_line.split ":"
      if values.count!=8
        raise "Line #{l+1}, error: #{input_line}. Values: #{values}"
      end

      file_a = "#{srcpath}/#{values[0]}"
      line_a = values[1].to_i
      col_a = values[2].to_i
      surface_form_a = values[3]
      file_b = "#{srcpath}/#{values[4]}"
      line_b = values[5].to_i
      col_b = values[6].to_i
      surface_form_b = values[7]

      model_a = project.models[file_a]
      model_b = project.models[file_b]

      raise "Model not found for #{file_a}. Available: #{project.models.keys}" unless model_a
      raise "Model not found for #{file_b}. Available: #{project.models.keys}" unless model_b

      plain_col_a = convert_from_tabcolumn_to_plaincolumn(file_a,line_a,col_a)
      plain_col_b = convert_from_tabcolumn_to_plaincolumn(file_b,line_b,col_b)

      # the position covers the surface form, on a single line
      pos_a = SourcePosition.new(SourcePoint.new(line_a,plain_col_a),SourcePoint.new(line_a,plain_col_a+surface_form_a.length-1))
      pos_b = SourcePosition.new(SourcePoint.new(line_b,plain_col_b),SourcePoint.new(line_b,plain_col_b+surface_form_b.length-1))

      # a node which cannot be located is reported and the line is skipped;
      # only StandardError is rescued so that interrupts and exits still propagate
      node_a = begin
        find_node(model_a,surface_form_a,pos_a)
      rescue StandardError => e
        puts "Line #{l+1}) problem with '#{surface_form_a}', file: #{file_a}, pos #{pos_a}: #{e}"
        nil
      end
      node_b = begin
        find_node(model_b,surface_form_b,pos_b)
      rescue StandardError => e
        puts "Line #{l+1}) problem with '#{surface_form_b}', file: #{file_b}, pos #{pos_b}: #{e}"
        nil
      end
      next unless node_a && node_b

      rel = CrossLanguageRelation.new([NodeId.from_node(node_a),NodeId.from_node(node_b)])
      f = features[rel]
      raise "Unknown features for #{rel} (a:#{node_a.source.artifact(:absolute).filename} L#{node_a.source.position(:absolute).begin_line},b:#{node_b.source.artifact(:absolute).filename} L#{node_b.source.position(:absolute).begin_line})" unless f

      entry = { result: true }
      f.each { |k,v| entry[k] = v }
      train_data[rel] = entry
    end
  end

  # all the others are implicitly negative examples
  project.iter_over_shared_ids_instances do |node_a,node_b|
    rel = CrossLanguageRelation.new([NodeId.from_node(node_a),NodeId.from_node(node_b)])
    next if train_data.has_key?(rel)

    entry = { result: false }
    features[rel].each { |k,v| entry[k] = v }
    train_data[rel] = entry
  end

  train_data
end
|
178
|
+
|
179
|
+
private
|
180
|
+
|
181
|
+
# Looks for the candidate which is contained in every other candidate.
#
# candidates_in_correct_position:: nodes exposing source.position, where a
#                                  position answers to include?(other_position)
#
# Returns the first candidate whose position is included in the position of
# all the other candidates, or nil when there is none (also for an empty list).
#
# NOTE(review): find_node compares positions obtained with position(:absolute),
# while here the default position is used — confirm the two are meant to agree.
def candidates_included_in_all_the_others(candidates_in_correct_position)
  candidates_in_correct_position.find do |small|
    candidates_in_correct_position.all? do |big|
      small==big || big.source.position.include?(small.source.position)
    end
  end
end
|
195
|
+
|
196
|
+
# Hook for diagnostic messages: the body is empty, so the message is discarded.
def verbose_msg(msg)
|
197
|
+
end
|
198
|
+
|
199
|
+
# Finds the node of +model+ which carries +surface_form+ at +position+.
#
# model::        the model to search; it is traversed with traverse(:also_foreign)
# surface_form:: the value which must be among the keys of the node's
#                collect_values_with_count
# position::     the position which must be included in the absolute position
#                of the node
#
# Among the nodes matching both criteria only those with the highest
# embedding level are retained. If exactly one is left it is returned.
# Otherwise the candidate included in all the others is returned, if any
# (see candidates_included_in_all_the_others).
#
# Raises a RuntimeError (after printing the candidates found) when no single
# node can be selected.
def find_node(model,surface_form,position)
  verbose_msg "Looking for '#{surface_form}'"
  candidates_in_correct_position = []
  candidates_in_other_positions = []
  max_embedding_level = -1
  model.traverse(:also_foreign) do |n|
    next unless n.collect_values_with_count.has_key?(surface_form)

    unless n.source.position(:absolute).include?(position)
      candidates_in_other_positions << n
      next
    end

    level = n.source.embedding_level
    next if level<max_embedding_level

    # a more deeply embedded node replaces the candidates collected so far
    candidates_in_correct_position.clear if level>max_embedding_level
    max_embedding_level = level
    candidates_in_correct_position << n
  end

  return candidates_in_correct_position[0] if candidates_in_correct_position.count==1

  smallest_candidate = candidates_included_in_all_the_others(candidates_in_correct_position)
  if smallest_candidate
    puts "More than one candidate, I pick up the smallest"
    return smallest_candidate
  end

  puts "I did not find exactly once '#{surface_form}' at #{position}. I found it there #{candidates_in_correct_position.count} times (found elsewhere #{candidates_in_other_positions.count} times)"
  candidates_in_other_positions.each do |wp|
    puts " * #{wp.source.position(:absolute)}"
  end
  puts "Candidate in corresponding position:"
  candidates_in_correct_position.each do |c|
    puts " * #{c} (embedded? #{c.source.embedded?})"
  end
  raise "Candidates found in #{position} are #{candidates_in_correct_position.count}"
end
|
240
|
+
|
241
|
+
# the given column is calculated counting 4 for each tab,
|
242
|
+
# while the output counts just 1 per tab
|
243
|
+
# Converts a column expressed counting 4 for each tab into a column
# counting 1 for each character, for the given line of the given file.
#
# file::       key of @file_lines (the cache of file lines set by to_train_data)
# line_index:: 1-based index of the line
# tabcol::     column counting 4 for each tab
#
# Returns the value calculated by tabcol_to_plaincol for that line.
# Raises a RuntimeError when the file has no such line: without the check,
# index 0 would silently select the last line of the file and an index past
# the end would fail later on a nil line.
def convert_from_tabcolumn_to_plaincolumn(file,line_index,tabcol)
  lines = @file_lines[file]
  unless line_index.between?(1,lines.count)
    raise "Line #{line_index} does not exist in #{file} (it has #{lines.count} lines)"
  end
  tabcol_to_plaincol(lines[line_index-1],tabcol)
end
|
247
|
+
|
248
|
+
# Converts a column calculated counting +tab_width+ for each tab into the
# number of characters of +line+ which precede it (each tab counting 1).
#
# line::      the text of the line
# tabcol::    column counting +tab_width+ for each tab
# tab_width:: how many columns a tab is worth in +tabcol+ (4 by default)
#
# Returns the number of characters consumed to reach +tabcol+.
# Raises a RuntimeError when +tabcol+ cannot be reached exactly, i.e. it
# falls in the middle of a tab.
#
# Characters past the end of the line are counted as 1 column each, so a
# column beyond the end of the line is not reported as an error.
def tabcol_to_plaincol(line,tabcol,tab_width=4)
  expanded = 0
  plain = 0
  while expanded<tabcol
    expanded += (line[plain]=="\t") ? tab_width : 1
    plain += 1
  end
  unless expanded==tabcol
    raise "Column #{tabcol} falls inside a tab (worth #{tab_width} columns) in line #{line.inspect}"
  end
  plain
end
|
258
|
+
|
259
|
+
end
|
260
|
+
|
261
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "codemodels"
|
4
|
+
require "codemodels/html"
|
5
|
+
require "codemodels/js"
|
6
|
+
require 'htmlentities'
|
7
|
+
require 'liquid'
|
8
|
+
|
9
|
+
module CrossLanguageSpotter
|
10
|
+
|
11
|
+
# Guesses the language of a file from its name.
#
# Returns 'html' when the name ends with .html or .htm (in any letter case),
# 'javascript' in all the other cases.
def _language_from_filename(filename)
  filename.downcase.end_with?('.html','.htm') ? 'html' : 'javascript'
end
|
18
|
+
|
19
|
+
# Renders the relations into an HTML report through the Liquid template.
#
# relations:: enumerable of hashes with the keys :node_a_file,
#             :node_a_begin_line, :node_a_end_line, :node_a_begin_column,
#             :node_a_end_column and the corresponding :node_b_* keys.
#             Lines are decremented by one before being passed to _code.
# output::    path of the file to write
#
# The template is read from ./resources/template.html when that file exists
# in the current directory; otherwise it is looked for three levels above
# this file (assumes this file lives in lib/crosslanguagespotter/ and
# resources/ sits next to lib/ — TODO confirm the layout of the gem).
def generate_report_file(relations,output)
  # each source file is read only once, the first time it is needed
  files_content = Hash.new{|h,k| h[k]=File.readlines(k)}

  template_path = './resources/template.html'
  unless File.exist?(template_path)
    template_path = File.expand_path('../../../resources/template.html',__FILE__)
  end
  template = Liquid::Template.parse(File.read(template_path))

  data = relations.map do |rel|
    entry = {}
    entry['filenameA'] = rel[:node_a_file]
    entry['languageA'] = _language_from_filename(entry['filenameA'])
    entry['srcfileA'] = _code(files_content,rel[:node_a_file],
      rel[:node_a_begin_line]-1,rel[:node_a_end_line]-1,
      rel[:node_a_begin_column],rel[:node_a_end_column])
    entry['filenameB'] = rel[:node_b_file]
    entry['languageB'] = _language_from_filename(entry['filenameB'])
    entry['srcfileB'] = _code(files_content,rel[:node_b_file],
      rel[:node_b_begin_line]-1,rel[:node_b_end_line]-1,
      rel[:node_b_begin_column],rel[:node_b_end_column])
    entry
  end

  File.open(output, 'w') {|f| f.write(template.render({"relations"=>data})) }
end
|
41
|
+
|
42
|
+
# Builds the HTML snippet showing a fragment of a file and the lines around it.
#
# files_content:: hash from filename to the list of the lines of the file
# filename::      the file to show
# begin_line::    0-based index of the line containing the fragment
# end_line::      not used: only the line at begin_line is highlighted
# begin_col::     column where the fragment starts (the highlight starts at
#                 the character of index begin_col-1)
# end_col::       index of the first character after the fragment
#
# Every piece of text is encoded with HTMLEntities (:decimal), the fragment
# is wrapped in a yellow <span> and the result is passed through
# _remove_extra_spaces.
def _code(files_content,filename,begin_line,end_line,begin_col,end_col)
  # one encoder is enough for the whole snippet
  coder = HTMLEntities.new
  snippet_lines = _get_snippet_lines(files_content[filename],begin_line)

  code = ""
  snippet_lines[:before].each do |l|
    code << coder.encode(l,:decimal)
  end
  snippet_lines[:lines].each do |l|
    fragment = l[(begin_col-1)...end_col]
    code << coder.encode(l[0...(begin_col-1)],:decimal)
    # NOTE(review): this looks like a debugging trace printed on standard
    # output for every snippet — confirm whether it can be removed
    puts "<<<#{fragment}>>>"
    code << '<span style="background-color:yellow;padding:2px">' << coder.encode(fragment,:decimal) << "</span>"
    code << coder.encode(l[end_col..-1],:decimal)
  end
  snippet_lines[:after].each do |l|
    code << coder.encode(l,:decimal)
  end
  _remove_extra_spaces(code)
end
|
61
|
+
|
62
|
+
# Selects a line and the lines surrounding it.
#
# lines::      the lines of the file
# line_index:: 0-based index of the selected line
# around::     how many lines to take before and after the selected one
#              (5 by default)
#
# Returns a hash with :before (up to +around+ lines preceding the selected
# one), :lines (a list containing only the selected line) and :after (up to
# +around+ lines following it). :before and :after are empty lists when
# there is nothing to take, also when line_index is past the end of +lines+.
def _get_snippet_lines(lines,line_index,around=5)
  start_line = [0,line_index-around].max
  end_line = [lines.count-1,line_index+around].min
  {
    before: lines[start_line...line_index] || [],
    lines: [lines[line_index]],
    after: lines[(line_index+1)..end_line] || []
  }
end
|
71
|
+
|
72
|
+
# Counts the spaces at the beginning of +s+ (tabs and other whitespace are
# not counted). Implemented with a regexp rather than recursion, so long
# runs of spaces neither copy the string repeatedly nor exhaust the stack.
def _number_of_spaces(s)
  s[/\A */].length
end
|
76
|
+
|
77
|
+
# Removes the indentation shared by all the lines of +code+.
#
# code::    the text to process
# newline:: the separator used to split the text in lines and to join them back
#
# The number of leading spaces common to all the non-blank lines is removed
# from every line. Blank lines are ignored when calculating it, otherwise a
# single empty line would prevent any indentation from being removed.
#
# NOTE(review): the default separator reads as a single space here. With
# that value String#split separates on every run of whitespace, so the text
# is re-joined with single spaces and no indentation is ever removed.
# Confirm the intended separator (possibly an encoded newline — the text
# received from _code has been through HTMLEntities).
def _remove_extra_spaces(code,newline=" ")
  lines = code.split(newline)
  indents = lines.reject { |l| l.strip.empty? }.map { |l| _number_of_spaces(l) }
  extra_spaces = indents.min || 0
  # a blank line may be shorter than the common indentation
  lines.map { |l| l[extra_spaces..-1] || "" }.join(newline)
end
|
87
|
+
|
88
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
module CrossLanguageSpotter
|
4
|
+
|
5
|
+
# Creates a Weka RandomTree, trains it on +training_instances+ and returns it.
def build_classifier(training_instances)
  Java::weka::classifiers::trees::RandomTree.new.tap do |tree|
    tree.build_classifier(training_instances)
  end
end
|
10
|
+
|
11
|
+
# Wraps the classifier returned by build_classifier and exposes the
# classification of a whole set of instances.
class WekaClassifier

  # training_instances:: the instances handed to build_classifier
  def initialize(training_instances)
    @weka_classifier = build_classifier(training_instances)
  end

  # Classifies each instance enumerated by data_instances.
  #
  # Returns a list with one hash per instance: :instance is the instance
  # itself and :result is true when the classifier answers 0.0
  # (presumably the index of "true", the first of the boolean values
  # registered by hash2weka_instances — confirm).
  def classify(data_instances)
    outcomes = []
    data_instances.enumerate_instances.each do |item|
      answer = @weka_classifier.classify_instance(item)
      outcomes << {result: answer==0.0, instance: item}
    end
    outcomes
  end

end
|
29
|
+
|
30
|
+
# Converts a list of hashes into a set of Weka instances.
#
# name::        name given to the set of instances
# data::        list of rows; each row is a hash providing a value for every key
# keys::        hash from attribute name to attribute type, which must be
#               either :numeric or :boolean
# class_value:: name of the attribute to be used as class, or nil to leave
#               the class unset
#
# Boolean values are stored as the nominal values "true" and "false",
# using the to_s of the value found in the row.
#
# Returns the Weka Instances.
# Raises a RuntimeError when a key or its type is nil, when a type is
# unknown or when +class_value+ is not among the keys.
def hash2weka_instances(name,data,keys,class_value)
  boolean_values = Java::weka::core::FastVector.new
  boolean_values.add_element("true")
  boolean_values.add_element("false")

  # fill attributes
  attributes = Java::weka::core::FastVector.new
  attributes_map = {}
  attributes_indexes = {}
  keys.each_with_index do |(k,v),i|
    raise "Null key in keys: #{keys}" unless k
    raise "Null value for key #{k} in keys: #{keys}" if v.nil?
    if v==:numeric
      # creates a numeric attribute
      a = Java::weka::core::Attribute.new(k.to_s)
    elsif v==:boolean
      a = Java::weka::core::Attribute.new(k.to_s,boolean_values)
    else
      raise "Unknown attribute type: #{v}"
    end
    attributes.add_element(a)
    attributes_map[k] = a
    attributes_indexes[k] = i
  end

  # fail with a clear message instead of passing nil to setClassIndex
  class_index = nil
  if class_value
    class_index = attributes_indexes[class_value]
    raise "Unknown class attribute #{class_value}, known attributes: #{keys.keys}" unless class_index
  end

  instances = Java::weka::core::Instances.new name, attributes, data.count

  # fill instances (the types have been validated while creating the attributes)
  data.each do |row|
    instance = Java::weka::core::Instance.new keys.count
    keys.each do |k,v|
      value = (v==:boolean) ? row[k].to_s : row[k]
      instance.setValue(attributes_map[k],value)
    end
    instances.add(instance)
  end

  instances.setClassIndex(class_index) if class_index

  instances
end
|
82
|
+
|
83
|
+
end
|
data/lib/jars/weka.jar
ADDED
Binary file
|