crosslanguagespotter 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/Gemfile +3 -0
  4. data/Rakefile +13 -0
  5. data/crosslanguagespotter.gemspec +36 -0
  6. data/examples/ex1.rb +13 -0
  7. data/examples/services_example.rb +13 -0
  8. data/lib/crosslanguagespotter/basic.rb +157 -0
  9. data/lib/crosslanguagespotter/context.rb +139 -0
  10. data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
  11. data/lib/crosslanguagespotter/jaccard.rb +114 -0
  12. data/lib/crosslanguagespotter/methods/context.rb +127 -0
  13. data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
  14. data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
  15. data/lib/crosslanguagespotter/model_loading.rb +333 -0
  16. data/lib/crosslanguagespotter/oracle.rb +261 -0
  17. data/lib/crosslanguagespotter/report.rb +88 -0
  18. data/lib/crosslanguagespotter/version.rb +5 -0
  19. data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
  20. data/lib/crosslanguagespotter.rb +7 -0
  21. data/lib/jars/weka.jar +0 -0
  22. data/resources/css/bootstrap-theme.css +346 -0
  23. data/resources/css/bootstrap-theme.min.css +7 -0
  24. data/resources/css/bootstrap.css +5780 -0
  25. data/resources/css/bootstrap.min.css +7 -0
  26. data/resources/css/highlightstyles/arta.css +160 -0
  27. data/resources/css/highlightstyles/ascetic.css +50 -0
  28. data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
  29. data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
  30. data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
  31. data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
  32. data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
  33. data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
  34. data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
  35. data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
  36. data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
  37. data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
  38. data/resources/css/highlightstyles/brown_paper.css +105 -0
  39. data/resources/css/highlightstyles/brown_papersq.png +0 -0
  40. data/resources/css/highlightstyles/dark.css +105 -0
  41. data/resources/css/highlightstyles/default.css +153 -0
  42. data/resources/css/highlightstyles/docco.css +132 -0
  43. data/resources/css/highlightstyles/far.css +113 -0
  44. data/resources/css/highlightstyles/foundation.css +133 -0
  45. data/resources/css/highlightstyles/github.css +125 -0
  46. data/resources/css/highlightstyles/googlecode.css +147 -0
  47. data/resources/css/highlightstyles/idea.css +122 -0
  48. data/resources/css/highlightstyles/ir_black.css +105 -0
  49. data/resources/css/highlightstyles/magula.css +123 -0
  50. data/resources/css/highlightstyles/mono-blue.css +62 -0
  51. data/resources/css/highlightstyles/monokai.css +127 -0
  52. data/resources/css/highlightstyles/monokai_sublime.css +149 -0
  53. data/resources/css/highlightstyles/obsidian.css +154 -0
  54. data/resources/css/highlightstyles/paraiso.dark.css +93 -0
  55. data/resources/css/highlightstyles/paraiso.light.css +93 -0
  56. data/resources/css/highlightstyles/pojoaque.css +106 -0
  57. data/resources/css/highlightstyles/pojoaque.jpg +0 -0
  58. data/resources/css/highlightstyles/railscasts.css +182 -0
  59. data/resources/css/highlightstyles/rainbow.css +112 -0
  60. data/resources/css/highlightstyles/school_book.css +113 -0
  61. data/resources/css/highlightstyles/school_book.png +0 -0
  62. data/resources/css/highlightstyles/solarized_dark.css +107 -0
  63. data/resources/css/highlightstyles/solarized_light.css +107 -0
  64. data/resources/css/highlightstyles/sunburst.css +160 -0
  65. data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
  66. data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
  67. data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
  68. data/resources/css/highlightstyles/tomorrow-night.css +93 -0
  69. data/resources/css/highlightstyles/tomorrow.css +90 -0
  70. data/resources/css/highlightstyles/vs.css +89 -0
  71. data/resources/css/highlightstyles/xcode.css +158 -0
  72. data/resources/css/highlightstyles/zenburn.css +117 -0
  73. data/resources/example.html +1501 -0
  74. data/resources/js/bootstrap.js +1943 -0
  75. data/resources/js/bootstrap.min.js +7 -0
  76. data/resources/js/highlight.pack.js +1 -0
  77. data/resources/services_example.html +141 -0
  78. data/resources/template.html +61 -0
  79. data/test/data/angular-puzzle.GS +111 -0
  80. data/test/data/angular_puzzle/app.js +66 -0
  81. data/test/data/angular_puzzle/index.html +67 -0
  82. data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
  83. data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
  84. data/test/data/example.html +5 -0
  85. data/test/data/example.js +4 -0
  86. data/test/data/services/index.html +33 -0
  87. data/test/data/services/script.js +15 -0
  88. data/test/test_helper.rb +9 -0
  89. data/test/test_parsing.rb +23 -0
  90. data/test/test_spotter.rb +42 -0
  91. data/test/test_wekaintegration.rb +43 -0
  92. metadata +328 -0
@@ -0,0 +1,261 @@
1
+ require 'codemodels'
2
+ require 'codemodels/js'
3
+ require 'codemodels/html'
4
+ require 'csv'
5
+ require 'crosslanguagespotter/model_loading'
6
+ #require 'console'
7
+ #require 'code_processing'
8
+
9
+ include CodeModels
10
+
11
+ module CrossLanguageSpotter
12
+
13
# One end of a relation as written in the oracle file: the file, the
# line, the column and the text (surface form) found at that place.
OracleRelationEnd = Struct.new(:file, :line, :col, :surface_form)

# One end of a relation identified by a file and an index.
MetaOracleRelationEnd = Struct.new(:file, :index)
15
+
16
+ class OracleLoader
17
+
18
# Trains a classifier from a project and its oracle.
#
# srcpath    - passed to to_train_data together with oraclepath
# oraclepath - passed to to_train_data together with srcpath
#
# The rows returned by to_train_data are converted with
# hash2weka_instances, using :result as the class attribute, and the
# resulting instances are handed to WekaClassifier.new.
def build_weka_classifier(srcpath,oraclepath)
  rows = to_train_data(srcpath,oraclepath).values

  # the order of the attributes is the order in which they are listed here
  numeric_features = [
    :shared_length,
    :tfidf_shared,
    :itfidf_shared,
    :perc_shared_length_min,
    :perc_shared_length_max,
    :diff_min,
    :diff_max,
    :perc_diff_min,
    :perc_diff_max,
    :context,
    :jaccard,
    :jaro,
    :tversky
  ]
  attribute_types = {}
  numeric_features.each { |feature| attribute_types[feature] = :numeric }
  attribute_types[:result] = :boolean

  WekaClassifier.new(hash2weka_instances("oracle",rows,attribute_types,:result))
end
43
+
44
# Builds the labelled data used to train the classifier.
#
# Each relation listed in the oracle file is a positive example
# (result: true). Every other pair yielded by
# project.iter_over_shared_ids_instances is a negative example
# (result: false).
#
# Oracle format: one relation per line, eight colon-separated fields
#
#   file_a:line_a:col_a:surface_form_a:file_b:line_b:col_b:surface_form_b
#
# Files are relative to srcpath. Columns are converted through
# convert_from_tabcolumn_to_plaincolumn. Blank lines and lines starting
# with '#' are skipped.
#
# When a node cannot be located for one end of a relation, a message is
# printed and that relation is left out of the positive examples.
#
# srcpath    - root directory of the project
# oraclepath - path of the oracle file
#
# Returns a Hash mapping each relation to a row (Hash) containing its
# features plus the :result label.
# Raises RuntimeError for a malformed line, for a file without a model
# and for a relation without features.
def to_train_data(srcpath,oraclepath)
  project = Project.new(srcpath)
  features = Spotter.new.features_for_project(project)

  # lazily filled cache, read by convert_from_tabcolumn_to_plaincolumn
  @file_lines = Hash.new { |cache,path| cache[path] = File.readlines(path) }

  # copies the features of a relation into a fresh row carrying its label
  labelled_row = lambda do |label,relation_features|
    row = { result: label }
    relation_features.each { |k,v| row[k] = v }
    row
  end

  train_data = {}
  # block form, so the oracle file is closed also when a line is rejected
  File.open(oraclepath,'r') do |oracle|
    oracle.each_with_index do |input_line,l|
      input_line = input_line.strip
      next if input_line.empty? || input_line.start_with?('#')

      values = input_line.split ":"
      if values.count!=8
        raise "Line #{l+1}, error: #{input_line}. Values: #{values}"
      end

      # the two ends of the relation, in the order in which they are written
      ends = [values[0,4],values[4,4]].map do |file,line,col,surface_form|
        { file: "#{srcpath}/#{file}", line: line.to_i, col: col.to_i, surface_form: surface_form }
      end

      ends.each do |e|
        e[:model] = project.models[e[:file]]
        raise "Model not found for #{e[:file]}. Available: #{project.models.keys}" unless e[:model]
      end

      ends.each do |e|
        plain_col = convert_from_tabcolumn_to_plaincolumn(e[:file],e[:line],e[:col])
        e[:pos] = SourcePosition.new(
          SourcePoint.new(e[:line],plain_col),
          SourcePoint.new(e[:line],plain_col+e[:surface_form].length-1))
      end

      node_a, node_b = ends.map do |e|
        begin
          find_node(e[:model],e[:surface_form],e[:pos])
        rescue StandardError => err
          # StandardError only: interrupts and exit requests must not be swallowed
          puts "Line #{l+1}) problem with '#{e[:surface_form]}', file: #{e[:file]}, pos #{e[:pos]}: #{err}"
          nil
        end
      end
      next unless node_a && node_b

      id_a = NodeId.from_node(node_a)
      id_b = NodeId.from_node(node_b)
      rel = CrossLanguageRelation.new([id_a,id_b])
      f = features[rel]
      raise "Unknown features for #{rel} (a:#{node_a.source.artifact(:absolute).filename} L#{node_a.source.position(:absolute).begin_line},b:#{node_b.source.artifact(:absolute).filename} L#{node_b.source.position(:absolute).begin_line})" unless f
      train_data[rel] = labelled_row.call(true,f)
    end
  end

  # all the others are implicitly negative examples
  project.iter_over_shared_ids_instances do |node_a,node_b|
    id_a = NodeId.from_node(node_a)
    id_b = NodeId.from_node(node_b)
    rel = CrossLanguageRelation.new([id_a,id_b])
    next if train_data.has_key?(rel)
    f = features[rel]
    raise "Unknown features for #{rel}" unless f
    train_data[rel] = labelled_row.call(false,f)
  end

  train_data
end
178
+
179
+ private
180
+
181
# Looks for the candidate whose position is included in the position of
# every other candidate.
#
# candidates_in_correct_position - nodes responding to source.position
#
# Returns the first such candidate, or nil when there is none (an empty
# list gives nil, a single candidate is returned as it is).
#
# NOTE(review): positions are read with source.position, without the
# :absolute argument used by find_node - confirm this is intended.
def candidates_included_in_all_the_others(candidates_in_correct_position)
  candidates_in_correct_position.find do |small|
    # all? stops at the first candidate not including small
    candidates_in_correct_position.all? do |big|
      small==big || big.source.position.include?(small.source.position)
    end
  end
end
195
+
196
# Hook for diagnostic messages, called by find_node.
# It currently discards the message and returns nil.
def verbose_msg(msg)
  nil
end
198
+
199
# Finds in the model the node corresponding to a surface form at a
# given position.
#
# A node is a candidate when collect_values_with_count has the surface
# form among its keys and its absolute position includes the given one.
# Among the candidates only those with the highest embedding level are
# kept. If more than one remains, the one included in all the others is
# picked.
#
# model        - the model to traverse (foreign nodes included)
# surface_form - the text to look for
# position     - the position in which the text is expected
#
# Returns the node found.
# Raises RuntimeError, after printing the candidates, when no node can
# be picked.
def find_node(model,surface_form,position)
  verbose_msg "Looking for '#{surface_form}'"
  candidates_in_correct_position = []
  candidates_in_other_positions = []
  max_embedding_level = -1
  model.traverse(:also_foreign) do |n|
    next unless n.collect_values_with_count.has_key?(surface_form)
    unless n.source.position(:absolute).include?(position)
      candidates_in_other_positions << n
      next
    end
    level = n.source.embedding_level
    next if level<max_embedding_level
    # a more deeply embedded node replaces the candidates found so far
    candidates_in_correct_position.clear if level>max_embedding_level
    max_embedding_level = level
    candidates_in_correct_position << n
  end

  return candidates_in_correct_position[0] if candidates_in_correct_position.count==1

  smallest_candidate = candidates_included_in_all_the_others(candidates_in_correct_position)
  if smallest_candidate
    puts "More than one candidate, I pick up the smallest"
    return smallest_candidate
  end

  puts "I did not find exactly once '#{surface_form}' at #{position}. I found it there #{candidates_in_correct_position.count} times (found elsewhere #{candidates_in_other_positions.count} times)"
  candidates_in_other_positions.each do |wp|
    puts " * #{wp.source.position(:absolute)}"
  end
  puts "Candidate in corresponding position:"
  candidates_in_correct_position.each do |c|
    puts " * #{c} (embedded? #{c.source.embedded?})"
  end
  raise "Candidates found in #{position} are #{candidates_in_correct_position.count}"
end
240
+
241
# Converts a column of a line of a file: the given column is calculated
# counting 4 for each tab, while the output counts just 1 per tab.
#
# file       - key of the @file_lines cache (the path of the file)
# line_index - number of the line, starting from 1
# tabcol     - the column to convert
#
# Returns the converted column.
# Raises RuntimeError when the file has no such line.
def convert_from_tabcolumn_to_plaincolumn(file,line_index,tabcol)
  lines = @file_lines[file]
  # an index below 1 must not wrap around to the end of the file
  line = line_index>=1 ? lines[line_index-1] : nil
  raise "Line #{line_index} not found in #{file} (#{lines.count} lines)" unless line
  tabcol_to_plaincol(line,tabcol)
end
247
+
248
# Converts a column calculated counting tab_width for each tab into a
# column calculated counting 1 for every character.
#
# line      - the text of the line
# tabcol    - the column to convert
# tab_width - how many columns a tab counts for (4 by default)
#
# Returns the converted column (0 when tabcol is 0).
# Raises RuntimeError when tabcol falls in the middle of a tab.
def tabcol_to_plaincol(line,tabcol,tab_width=4)
  c = 0 # columns consumed, counting tab_width per tab
  i = 0 # characters consumed
  while c<tabcol
    c += (line[i]=="\t") ? tab_width : 1
    i += 1
  end
  unless c==tabcol
    raise "Column #{tabcol} falls inside a tab (tab width #{tab_width}) in line #{line.inspect}"
  end
  i
end
258
+
259
+ end
260
+
261
+ end
@@ -0,0 +1,88 @@
1
+ # encoding: utf-8
2
+
3
+ require "codemodels"
4
+ require "codemodels/html"
5
+ require "codemodels/js"
6
+ require 'htmlentities'
7
+ require 'liquid'
8
+
9
+ module CrossLanguageSpotter
10
+
11
# Tells which language should be used to highlight a file.
#
# filename - name or path of the file
#
# Returns 'html' for names ending with .html or .htm (in any case),
# 'javascript' for everything else.
def _language_from_filename(filename)
  filename.downcase.end_with?('.html','.htm') ? 'html' : 'javascript'
end
18
+
19
# Writes the report on the given relations, rendering a Liquid template.
#
# The template is ./resources/template.html when that file exists in
# the current directory. Otherwise it is resources/template.html, three
# levels above this file, so the report no longer depends on the
# directory the process is started from.
#
# relations - list of Hashes with the keys :node_a_file,
#             :node_a_begin_line, :node_a_end_line, :node_a_begin_column,
#             :node_a_end_column and the corresponding :node_b_* keys
# output    - path of the file to write
def generate_report_file(relations,output)
  files_content = Hash.new { |h,k| h[k] = File.readlines(k) }

  template_path = './resources/template.html'
  unless File.exist?(template_path)
    template_path = File.expand_path('../../../resources/template.html',__FILE__)
  end
  template = Liquid::Template.parse(File.read(template_path))

  data = relations.map do |rel|
    entry = {}
    %w[a b].each do |side|
      suffix = side.upcase
      filename = rel[:"node_#{side}_file"]
      entry["filename#{suffix}"] = filename
      entry["language#{suffix}"] = _language_from_filename(filename)
      # lines are passed to _code decremented by one, columns as they are
      entry["srcfile#{suffix}"] = _code(files_content,filename,
        rel[:"node_#{side}_begin_line"]-1,rel[:"node_#{side}_end_line"]-1,
        rel[:"node_#{side}_begin_column"],rel[:"node_#{side}_end_column"])
    end
    entry
  end

  File.open(output,'w') { |f| f.write(template.render({"relations"=>data})) }
end
41
+
42
# Produces the HTML-encoded snippet shown in the report: the line at
# begin_line with the lines around it (see _get_snippet_lines). The
# characters from begin_col to end_col of that line are wrapped in a
# highlighting span.
#
# files_content - maps a filename to the list of its lines
# filename      - the file to take the snippet from
# begin_line    - index (from 0) of the line to highlight
# end_line      - not used: only the line at begin_line is highlighted
# begin_col     - first highlighted column, counting from 1
# end_col       - last highlighted column, counting from 1
#
# Returns the snippet, with the common indentation removed.
def _code(files_content,filename,begin_line,end_line,begin_col,end_col)
  coder = HTMLEntities.new
  snippet_lines = _get_snippet_lines(files_content[filename],begin_line)
  code = ""
  snippet_lines[:before].each { |l| code << coder.encode(l,:decimal) }
  snippet_lines[:lines].each do |l|
    # to_s: a slice starting after the end of the line is nil
    code << coder.encode(l[0...(begin_col-1)].to_s,:decimal)
    code << '<span style="background-color:yellow;padding:2px">'
    code << coder.encode(l[(begin_col-1)...end_col].to_s,:decimal)
    code << "</span>"
    code << coder.encode(l[end_col..-1].to_s,:decimal)
  end
  snippet_lines[:after].each { |l| code << coder.encode(l,:decimal) }
  _remove_extra_spaces(code)
end
61
+
62
# Selects a line together with the lines around it.
#
# lines      - all the lines of a file
# line_index - index (from 0) of the selected line
# around     - how many lines to take before and after it (5 by default)
#
# Returns a Hash with :before (the lines preceding the selected one),
# :lines (a list containing only the selected line) and :after (the
# lines following it). Fewer lines are returned close to the beginning
# or to the end of the file.
def _get_snippet_lines(lines,line_index,around=5)
  start_line = [0,line_index-around].max
  end_line = [lines.count-1,line_index+around].min
  {
    before: lines[start_line...line_index],
    lines: [lines[line_index]],
    after: lines[(line_index+1)..end_line]
  }
end
71
+
72
# Counts the spaces at the beginning of a string. Only the space
# character is counted, not tabs or other whitespace.
#
# A single match is used instead of one recursive call per space, so
# long runs of spaces cannot exhaust the stack.
def _number_of_spaces(s)
  s[/\A */].length
end
76
+
77
# Removes the indentation shared by all the lines of a snippet.
#
# code    - the snippet, with lines separated by newline
# newline - the line separator ("&#10;" by default)
#
# Blank lines are not considered when looking for the shared
# indentation: otherwise one empty line would be enough to leave the
# whole snippet indented.
#
# Returns the lines, without the shared leading spaces, joined by
# newline.
def _remove_extra_spaces(code,newline="&#10;")
  lines = code.split(newline)
  indents = lines.reject { |l| l.strip.empty? }.map { |l| _number_of_spaces(l) }
  extra_spaces = indents.min || 0
  # to_s: slicing a line shorter than the indentation gives nil
  lines.map { |l| l[extra_spaces..-1].to_s }.join(newline)
end
87
+
88
+ end
@@ -0,0 +1,5 @@
1
+ # encoding: utf-8
2
+
3
module CrossLanguageSpotter
  # Version of the gem. Frozen, so that it cannot be modified in place.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,83 @@
1
+ require 'java'
2
+
3
+ module CrossLanguageSpotter
4
+
5
# Creates a Weka RandomTree and builds it on the given training
# instances.
#
# Returns the classifier.
def build_classifier(training_instances)
  Java::weka::classifiers::trees::RandomTree.new.tap do |tree|
    tree.build_classifier(training_instances)
  end
end
10
+
11
# Wraps the classifier obtained from build_classifier.
class WekaClassifier

  # training_instances - the instances passed to build_classifier
  #
  # NOTE(review): build_classifier is defined in this file as an
  # instance method of the CrossLanguageSpotter module. This call
  # resolves only if that module is mixed into this class or into
  # Object - confirm.
  def initialize(training_instances)
    @weka_classifier = build_classifier(training_instances)
  end

  # Classifies each of the given instances.
  #
  # Returns a list with one Hash per instance: :instance is the
  # instance and :result is true when classify_instance returned 0.0.
  # Presumably 0.0 is the index of "true", the first of the boolean
  # values registered by hash2weka_instances - confirm.
  def classify(data_instances)
    outcomes = []
    data_instances.enumerate_instances.each do |inst|
      predicted = @weka_classifier.classify_instance(inst)
      outcomes << { result: (predicted==0.0), instance: inst }
    end
    outcomes
  end

end
29
+
30
# Converts a list of Hashes into Weka instances.
#
# name        - name given to the set of instances
# data        - list of rows; each row is a Hash read through the keys
#               listed in keys
# keys        - Hash mapping each attribute to its type, :numeric or
#               :boolean. Attributes are created in the order in which
#               they are listed
# class_value - the key of the class attribute; nil (or false) to leave
#               the class index unset
#
# Boolean values are stored as the strings "true" and "false".
#
# Returns the instances.
# Raises RuntimeError for a null key or type, for an unknown type and
# for a class_value which is not among the keys.
def hash2weka_instances(name,data,keys,class_value)
  boolean_values = Java::weka::core::FastVector.new
  %w[true false].each { |label| boolean_values.add_element(label) }

  # fill attributes
  attributes = Java::weka::core::FastVector.new
  attributes_map = {}
  attributes_indexes = {}
  keys.each_with_index do |(k,v),i|
    raise "Null key in keys: #{keys}" unless k
    raise "Null value for key #{k} in keys: #{keys}" if v.nil?
    a = case v
        when :numeric
          # creates a numeric attribute
          Java::weka::core::Attribute.new(k.to_s)
        when :boolean
          Java::weka::core::Attribute.new(k.to_s,boolean_values)
        else
          raise "Unknown attribute type: #{v}"
        end
    attributes.add_element(a)
    attributes_map[k] = a
    attributes_indexes[k] = i
  end
  instances = Java::weka::core::Instances.new name, attributes, data.count

  # fill instances (the types have been validated above)
  data.each do |row|
    instance = Java::weka::core::Instance.new keys.count
    keys.each do |k,v|
      value = row[k]
      value = value.to_s if v==:boolean
      instance.setValue(attributes_map[k],value)
    end
    instances.add(instance)
  end

  if class_value
    class_index = attributes_indexes[class_value]
    # fail here, with the name of the attribute, instead of passing nil to Weka
    raise "Class attribute #{class_value} is not among the keys: #{keys.keys}" unless class_index
    instances.setClassIndex(class_index)
  end

  instances
end
82
+
83
+ end
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+
3
+ require 'jars/weka.jar'
4
+
5
curr_dir = File.dirname(__FILE__)

# Load all the files of the gem. The list is sorted because the order of
# the results of Dir[] depends on the platform on older runtimes: in
# this way the files are always loaded in the same order.
Dir["#{curr_dir}/crosslanguagespotter/*.rb"].sort.each { |rb| require rb }
data/lib/jars/weka.jar ADDED
Binary file