crosslanguagespotter 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/Gemfile +3 -0
  4. data/Rakefile +13 -0
  5. data/crosslanguagespotter.gemspec +36 -0
  6. data/examples/ex1.rb +13 -0
  7. data/examples/services_example.rb +13 -0
  8. data/lib/crosslanguagespotter/basic.rb +157 -0
  9. data/lib/crosslanguagespotter/context.rb +139 -0
  10. data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
  11. data/lib/crosslanguagespotter/jaccard.rb +114 -0
  12. data/lib/crosslanguagespotter/methods/context.rb +127 -0
  13. data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
  14. data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
  15. data/lib/crosslanguagespotter/model_loading.rb +333 -0
  16. data/lib/crosslanguagespotter/oracle.rb +261 -0
  17. data/lib/crosslanguagespotter/report.rb +88 -0
  18. data/lib/crosslanguagespotter/version.rb +5 -0
  19. data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
  20. data/lib/crosslanguagespotter.rb +7 -0
  21. data/lib/jars/weka.jar +0 -0
  22. data/resources/css/bootstrap-theme.css +346 -0
  23. data/resources/css/bootstrap-theme.min.css +7 -0
  24. data/resources/css/bootstrap.css +5780 -0
  25. data/resources/css/bootstrap.min.css +7 -0
  26. data/resources/css/highlightstyles/arta.css +160 -0
  27. data/resources/css/highlightstyles/ascetic.css +50 -0
  28. data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
  29. data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
  30. data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
  31. data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
  32. data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
  33. data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
  34. data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
  35. data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
  36. data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
  37. data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
  38. data/resources/css/highlightstyles/brown_paper.css +105 -0
  39. data/resources/css/highlightstyles/brown_papersq.png +0 -0
  40. data/resources/css/highlightstyles/dark.css +105 -0
  41. data/resources/css/highlightstyles/default.css +153 -0
  42. data/resources/css/highlightstyles/docco.css +132 -0
  43. data/resources/css/highlightstyles/far.css +113 -0
  44. data/resources/css/highlightstyles/foundation.css +133 -0
  45. data/resources/css/highlightstyles/github.css +125 -0
  46. data/resources/css/highlightstyles/googlecode.css +147 -0
  47. data/resources/css/highlightstyles/idea.css +122 -0
  48. data/resources/css/highlightstyles/ir_black.css +105 -0
  49. data/resources/css/highlightstyles/magula.css +123 -0
  50. data/resources/css/highlightstyles/mono-blue.css +62 -0
  51. data/resources/css/highlightstyles/monokai.css +127 -0
  52. data/resources/css/highlightstyles/monokai_sublime.css +149 -0
  53. data/resources/css/highlightstyles/obsidian.css +154 -0
  54. data/resources/css/highlightstyles/paraiso.dark.css +93 -0
  55. data/resources/css/highlightstyles/paraiso.light.css +93 -0
  56. data/resources/css/highlightstyles/pojoaque.css +106 -0
  57. data/resources/css/highlightstyles/pojoaque.jpg +0 -0
  58. data/resources/css/highlightstyles/railscasts.css +182 -0
  59. data/resources/css/highlightstyles/rainbow.css +112 -0
  60. data/resources/css/highlightstyles/school_book.css +113 -0
  61. data/resources/css/highlightstyles/school_book.png +0 -0
  62. data/resources/css/highlightstyles/solarized_dark.css +107 -0
  63. data/resources/css/highlightstyles/solarized_light.css +107 -0
  64. data/resources/css/highlightstyles/sunburst.css +160 -0
  65. data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
  66. data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
  67. data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
  68. data/resources/css/highlightstyles/tomorrow-night.css +93 -0
  69. data/resources/css/highlightstyles/tomorrow.css +90 -0
  70. data/resources/css/highlightstyles/vs.css +89 -0
  71. data/resources/css/highlightstyles/xcode.css +158 -0
  72. data/resources/css/highlightstyles/zenburn.css +117 -0
  73. data/resources/example.html +1501 -0
  74. data/resources/js/bootstrap.js +1943 -0
  75. data/resources/js/bootstrap.min.js +7 -0
  76. data/resources/js/highlight.pack.js +1 -0
  77. data/resources/services_example.html +141 -0
  78. data/resources/template.html +61 -0
  79. data/test/data/angular-puzzle.GS +111 -0
  80. data/test/data/angular_puzzle/app.js +66 -0
  81. data/test/data/angular_puzzle/index.html +67 -0
  82. data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
  83. data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
  84. data/test/data/example.html +5 -0
  85. data/test/data/example.js +4 -0
  86. data/test/data/services/index.html +33 -0
  87. data/test/data/services/script.js +15 -0
  88. data/test/test_helper.rb +9 -0
  89. data/test/test_parsing.rb +23 -0
  90. data/test/test_spotter.rb +42 -0
  91. data/test/test_wekaintegration.rb +43 -0
  92. metadata +328 -0
@@ -0,0 +1,261 @@
1
+ require 'codemodels'
2
+ require 'codemodels/js'
3
+ require 'codemodels/html'
4
+ require 'csv'
5
+ require 'crosslanguagespotter/model_loading'
6
+ #require 'console'
7
+ #require 'code_processing'
8
+
9
+ include CodeModels
10
+
11
+ module CrossLanguageSpotter
12
+
13
# One end of a relation as written in the oracle file: the file, the line,
# the column and the text (surface form) found there.
OracleRelationEnd = Struct.new(:file, :line, :col, :surface_form)

# One end of a relation identified by a file and an index.
MetaOracleRelationEnd = Struct.new(:file, :index)
15
+
16
+ class OracleLoader
17
+
18
# Trains a WekaClassifier from the rows produced by to_train_data.
#
# srcpath    - passed through to to_train_data
# oraclepath - passed through to to_train_data
#
# Every feature is declared numeric; :result is the boolean class attribute.
def build_weka_classifier(srcpath,oraclepath)
  rows = to_train_data(srcpath,oraclepath).values

  # the order of this list is the order of the Weka attributes
  numeric_features = [
    :shared_length, :tfidf_shared, :itfidf_shared,
    :perc_shared_length_min, :perc_shared_length_max,
    :diff_min, :diff_max, :perc_diff_min, :perc_diff_max,
    :context, :jaccard, :jaro, :tversky
  ]
  attribute_types = {}
  numeric_features.each { |feature| attribute_types[feature] = :numeric }
  attribute_types[:result] = :boolean

  WekaClassifier.new(hash2weka_instances("oracle",rows,attribute_types,:result))
end
43
+
44
# Builds the labelled feature rows used to train the classifier.
#
# Each meaningful line of the oracle file describes one true relation as
# eight colon-separated fields:
#   file_a:line_a:col_a:surface_form_a:file_b:line_b:col_b:surface_form_b
# File names are resolved relative to srcpath. Lines starting with '#' and
# blank lines are ignored. Every relation listed in the oracle becomes a
# positive example (result: true); every other pair yielded by
# Project#iter_over_shared_ids_instances becomes a negative one.
#
# An oracle line whose nodes cannot be located is reported on stdout and
# skipped.
#
# srcpath    - root directory of the project to analyse
# oraclepath - path of the oracle file
#
# Returns a Hash mapping each CrossLanguageRelation to its row: the values
# computed by Spotter#features_for_project plus the :result label.
# Raises RuntimeError on a malformed oracle line, when a file has no model
# in the project, or when there are no features for a relation.
def to_train_data(srcpath,oraclepath)
  project = Project.new(srcpath)
  features = Spotter.new.features_for_project(project)

  # lines of each source file, read on first access; used by
  # convert_from_tabcolumn_to_plaincolumn
  @file_lines = Hash.new { |cache,path| cache[path] = File.readlines(path) }

  train_data = {}
  # File.foreach closes the oracle file once the iteration is over
  # TODO: duplicated relations are not detected, the last line wins
  File.foreach(oraclepath).with_index(1) do |raw_line,line_number|
    input_line = raw_line.strip
    next if input_line.empty? || input_line.start_with?('#')

    values = input_line.split(":")
    if values.count!=8
      raise "Line #{line_number}, error: #{input_line}. Values: #{values}"
    end

    relation_ends = [values[0,4],values[4,4]].map do |file,line,col,surface_form|
      { file: "#{srcpath}/#{file}", line: line.to_i, col: col.to_i, surface_form: surface_form }
    end

    relation_ends.each do |re|
      re[:model] = project.models[re[:file]]
      raise "Model not found for #{re[:file]}. Available: #{project.models.keys}" unless re[:model]
    end

    relation_ends.each do |re|
      plain_col = convert_from_tabcolumn_to_plaincolumn(re[:file],re[:line],re[:col])
      re[:position] = SourcePosition.new(
        SourcePoint.new(re[:line],plain_col),
        SourcePoint.new(re[:line],plain_col+re[:surface_form].length-1))
    end

    # a node which cannot be located is reported, not fatal; only
    # StandardError is rescued so that signals and exits still propagate
    node_a, node_b = relation_ends.map do |re|
      begin
        find_node(re[:model],re[:surface_form],re[:position])
      rescue StandardError => e
        puts "Line #{line_number}) problem with '#{re[:surface_form]}', file: #{re[:file]}, pos #{re[:position]}: #{e}"
        nil
      end
    end
    next unless node_a && node_b

    rel = CrossLanguageRelation.new([NodeId.from_node(node_a),NodeId.from_node(node_b)])
    f = features[rel]
    raise "Unknown features for #{rel} (a:#{node_a.source.artifact(:absolute).filename} L#{node_a.source.position(:absolute).begin_line},b:#{node_b.source.artifact(:absolute).filename} L#{node_b.source.position(:absolute).begin_line})" unless f
    train_data[rel] = labelled_row(f,true)
  end

  # all the others are implicitly negative examples
  project.iter_over_shared_ids_instances do |node_a,node_b|
    rel = CrossLanguageRelation.new([NodeId.from_node(node_a),NodeId.from_node(node_b)])
    next if train_data.has_key?(rel)
    f = features[rel]
    raise "Unknown features for #{rel}" unless f
    train_data[rel] = labelled_row(f,false)
  end

  train_data
end

# Copies the feature values into a new row carrying the given :result label.
def labelled_row(feature_values,label)
  row = { result: label }
  feature_values.each { |name,value| row[name] = value }
  row
end
private :labelled_row
178
+
179
+ private
180
+
181
# Returns the first candidate whose source position is included in the
# source position of every other candidate, or nil when there is none.
def candidates_included_in_all_the_others(candidates_in_correct_position)
  candidates_in_correct_position.find do |small|
    candidates_in_correct_position.all? do |big|
      small==big || big.source.position.include?(small.source.position)
    end
  end
end
195
+
196
# Hook for diagnostic messages. It is a no-op: the message is discarded.
def verbose_msg(msg)
  nil
end
198
+
199
# Finds the node of the model which holds surface_form at the given position.
#
# Only nodes whose values include surface_form are considered. Among the
# ones whose absolute position includes the requested position, only those
# with the highest embedding level are kept. If exactly one is left it is
# returned; otherwise the candidate included in all the others is picked.
#
# Raises RuntimeError (after printing the candidates) when no single node
# can be chosen.
def find_node(model,surface_form,position)
  verbose_msg "Looking for '#{surface_form}'"
  matches_here = []
  matches_elsewhere = []
  deepest_level = -1

  model.traverse(:also_foreign) do |node|
    next unless node.collect_values_with_count.has_key?(surface_form)
    unless node.source.position(:absolute).include?(position)
      matches_elsewhere << node
      next
    end
    level = node.source.embedding_level
    next if level<deepest_level
    # a more deeply embedded match supersedes the ones found so far
    matches_here.clear if level>deepest_level
    deepest_level = level
    matches_here << node
  end

  return matches_here[0] if matches_here.count==1

  smallest_candidate = candidates_included_in_all_the_others(matches_here)
  if smallest_candidate
    puts "More than one candidate, I pick up the smallest"
    return smallest_candidate
  end

  puts "I did not find exactly once '#{surface_form}' at #{position}. I found it there #{matches_here.count} times (found elsewhere #{matches_elsewhere.count} times)"
  matches_elsewhere.each do |other|
    puts " * #{other.source.position(:absolute)}"
  end
  puts "Candidate in corresponding position:"
  matches_here.each do |candidate|
    puts " * #{candidate} (embedded? #{candidate.source.embedded?})"
  end
  raise "Candidates found in #{position} are #{matches_here.count}"
end
240
+
241
# Converts a column of a line of a file from "tab columns" to "plain columns".
#
# The given column is calculated counting 4 for each tab, while the
# result counts just 1 per tab.
#
# file       - key of the @file_lines cache (set up by to_train_data)
# line_index - line number, starting from 1
# tabcol     - column to convert
#
# Raises RuntimeError when line_index is outside the file; without the
# check 0 and negative values would silently read lines from the end.
def convert_from_tabcolumn_to_plaincolumn(file,line_index,tabcol)
  lines = @file_lines[file]
  unless line_index.between?(1,lines.count)
    raise "Line #{line_index} is out of range for #{file} (#{lines.count} lines)"
  end
  tabcol_to_plaincol(lines[line_index-1],tabcol)
end
247
+
248
# Converts a column in which every tab counts as tab_width columns into
# the number of characters of the line covered by that column.
#
# line      - text of the line
# tabcol    - column counting tab_width for each tab
# tab_width - columns taken by a tab (4 by default)
#
# Positions past the end of the line count as one column each.
#
# Raises ArgumentError when line is nil and RuntimeError when tabcol falls
# in the middle of a tab.
def tabcol_to_plaincol(line,tabcol,tab_width=4)
  raise ArgumentError, "line must not be nil" if line.nil?
  reached = 0
  consumed = 0
  while reached<tabcol
    reached += (line[consumed]=="\t" ? tab_width : 1)
    consumed += 1
  end
  unless reached==tabcol
    raise "Column #{tabcol} falls inside a tab (tab width #{tab_width}) in line #{line.inspect}"
  end
  consumed
end
258
+
259
+ end
260
+
261
+ end
@@ -0,0 +1,88 @@
1
+ # encoding: utf-8
2
+
3
+ require "codemodels"
4
+ require "codemodels/html"
5
+ require "codemodels/js"
6
+ require 'htmlentities'
7
+ require 'liquid'
8
+
9
+ module CrossLanguageSpotter
10
+
11
# Name of the language used to highlight a file: 'html' for names ending
# with '.html', 'javascript' for everything else.
def _language_from_filename(filename)
  filename.end_with?('.html') ? 'html' : 'javascript'
end
18
+
19
# Renders the relations through the Liquid template and writes the result.
#
# relations - enumerable of Hash-like records with the keys
#             :node_X_file, :node_X_begin_line, :node_X_end_line,
#             :node_X_begin_column and :node_X_end_column, X being a and b
# output    - path of the file to write
#
# The template is ./resources/template.html when it exists in the working
# directory, otherwise the one packaged with the gem, so that the report
# can be generated from any directory.
def generate_report_file(relations,output)
  files_content = Hash.new { |cache,path| cache[path] = File.readlines(path) }
  template = Liquid::Template.parse(File.read(_report_template_path))

  data = relations.map do |rel|
    _report_entry_for_end(files_content,rel,'a','A').merge(
      _report_entry_for_end(files_content,rel,'b','B'))
  end

  File.open(output, 'w') { |f| f.write(template.render({"relations"=>data})) }
end

# Path of the template: the one in the working directory if present,
# otherwise resources/template.html two levels above this file.
def _report_template_path
  local_template = './resources/template.html'
  return local_template if File.exist?(local_template)
  File.expand_path('../../resources/template.html',File.dirname(__FILE__))
end

# Template variables (filename, language, source snippet) for one end of a
# relation; side is 'a' or 'b', label the suffix used by the template.
def _report_entry_for_end(files_content,rel,side,label)
  filename = rel[:"node_#{side}_file"]
  {
    "filename#{label}" => filename,
    "language#{label}" => _language_from_filename(filename),
    # lines are converted to 0-based indexes, columns are passed as they are
    "srcfile#{label}" => _code(files_content,filename,
      rel[:"node_#{side}_begin_line"]-1,rel[:"node_#{side}_end_line"]-1,
      rel[:"node_#{side}_begin_column"],rel[:"node_#{side}_end_column"])
  }
end
41
+
42
# Builds the HTML-encoded snippet shown in the report for one node.
#
# The snippet is made of the line at index begin_line plus the surrounding
# lines returned by _get_snippet_lines. In that line the characters from
# index begin_col-1 up to (excluding) index end_col are wrapped in a
# highlighting <span>. Common leading spaces are removed at the end.
#
# files_content - maps a filename to the Array of its lines
# end_line      - accepted but not used: only begin_line is highlighted
#
# Returns the snippet as a String with entities encoded in decimal form.
def _code(files_content,filename,begin_line,end_line,begin_col,end_col)
  encoder = HTMLEntities.new
  snippet_lines = _get_snippet_lines(files_content[filename],begin_line)

  parts = snippet_lines[:before].map { |l| encoder.encode(l,:decimal) }
  snippet_lines[:lines].each do |l|
    parts << encoder.encode(l[0...(begin_col-1)],:decimal)
    parts << '<span style="background-color:yellow;padding:2px">'
    parts << encoder.encode(l[(begin_col-1)...end_col],:decimal)
    parts << "</span>"
    parts << encoder.encode(l[end_col..-1],:decimal)
  end
  snippet_lines[:after].each { |l| parts << encoder.encode(l,:decimal) }

  _remove_extra_spaces(parts.join)
end
61
+
62
# Selects the line at line_index together with its context.
#
# lines      - Array with the lines of a file
# line_index - 0-based index of the line of interest
# around     - maximum number of lines of context on each side (default 5)
#
# Returns a Hash with the keys :before, :lines (always one element) and
# :after; the context is cut at the beginning and at the end of the file.
def _get_snippet_lines(lines,line_index,around=5)
  start_line = [0,line_index-around].max
  end_line = [lines.count-1,line_index+around].min
  {
    before: lines[start_line...line_index],
    lines: [lines[line_index]],
    after: lines[(line_index+1)..end_line]
  }
end
71
+
72
# Counts the space characters at the beginning of s (tabs do not count).
def _number_of_spaces(s)
  s.each_char.take_while { |char| char==' ' }.length
end
76
+
77
# Removes from every line the leading spaces shared by all the lines.
#
# code    - text whose lines are separated by newline
# newline - line separator (by default the decimal entity of "\n")
#
# Returns the lines joined again with the same separator.
def _remove_extra_spaces(code,newline="&#10;")
  lines = code.split(newline)
  extra_spaces = lines.map { |line| _number_of_spaces(line) }.min
  lines.map { |line| line[extra_spaces..-1] }.join(newline)
end
87
+
88
+ end
@@ -0,0 +1,5 @@
1
+ # encoding: utf-8
2
+
3
module CrossLanguageSpotter
  # Version of the gem.
  VERSION = '0.0.2'
end
@@ -0,0 +1,83 @@
1
+ require 'java'
2
+
3
+ module CrossLanguageSpotter
4
+
5
# Creates a Weka RandomTree, builds it on the training instances and
# returns it.
def build_classifier(training_instances)
  Java::weka::classifiers::trees::RandomTree.new.tap do |tree|
    tree.build_classifier(training_instances)
  end
end
10
+
11
# Wraps the Weka classifier obtained from build_classifier.
class WekaClassifier

  # training_instances - instances handed to build_classifier
  def initialize(training_instances)
    @weka_classifier = build_classifier(training_instances)
  end

  # Classifies each of the given instances.
  #
  # Returns an Array with one Hash per instance: :instance is the instance
  # and :result is true when the classifier answers 0.0.
  # NOTE(review): 0.0 is presumably the index of "true", the first nominal
  # value declared by hash2weka_instances - confirm.
  def classify(data_instances)
    results = []
    data_instances.enumerate_instances.each do |instance|
      predicted = @weka_classifier.classify_instance(instance)
      results << { result: predicted==0.0, instance: instance }
    end
    results
  end

end
29
+
30
# Converts an Array of Hashes into a Weka Instances object.
#
# name        - name given to the Instances
# data        - Array of rows; each row is read through the keys of keys
# keys        - Hash mapping each attribute name to :numeric or :boolean;
#               its order is the order of the attributes
# class_value - name of the class attribute, or nil to leave it unset
#
# Raises RuntimeError for a null key, a null type or an unknown type.
def hash2weka_instances(name,data,keys,class_value)
  # nominal values shared by all the boolean attributes
  nominal_booleans = Java::weka::core::FastVector.new
  ["true","false"].each { |label| nominal_booleans.add_element(label) }

  # fill attributes
  attribute_list = Java::weka::core::FastVector.new
  attribute_by_key = {}
  index_by_key = {}
  keys.each_with_index do |(key,type),position|
    raise "Null key in keys: #{keys}" unless key
    raise "Null value for key #{key} in keys: #{keys}" if type==nil
    attribute = case type
      when :numeric then Java::weka::core::Attribute.new(key.to_s)
      when :boolean then Java::weka::core::Attribute.new(key.to_s,nominal_booleans)
      else raise "Unknown attribute type: #{type}"
    end
    attribute_list.add_element(attribute)
    attribute_by_key[key] = attribute
    index_by_key[key] = position
  end
  instances = Java::weka::core::Instances.new name, attribute_list, data.count

  # fill instances
  data.each do |row|
    instance = Java::weka::core::Instance.new keys.count
    keys.each do |key,type|
      attribute = attribute_by_key[key]
      case type
      when :numeric
        instance.setValue(attribute,row[key])
      when :boolean
        # booleans are stored through their "true"/"false" label
        instance.setValue(attribute,row[key].to_s)
      else
        raise "Unknown attribute type: #{type}"
      end
    end
    instances.add(instance)
  end

  instances.setClassIndex(index_by_key[class_value]) if class_value
  instances
end
82
+
83
+ end
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+
3
+ require 'jars/weka.jar'
4
+
5
# Load every file of the crosslanguagespotter directory. The list is sorted
# because the order of the results of Dir[] is not the same on every
# platform and Ruby version.
curr_dir = File.dirname(__FILE__)

Dir["#{curr_dir}/crosslanguagespotter/*.rb"].sort.each { |rb| require rb }
data/lib/jars/weka.jar ADDED
Binary file