crosslanguagespotter 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/Gemfile +3 -0
- data/Rakefile +13 -0
- data/crosslanguagespotter.gemspec +36 -0
- data/examples/ex1.rb +13 -0
- data/examples/services_example.rb +13 -0
- data/lib/crosslanguagespotter/basic.rb +157 -0
- data/lib/crosslanguagespotter/context.rb +139 -0
- data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
- data/lib/crosslanguagespotter/jaccard.rb +114 -0
- data/lib/crosslanguagespotter/methods/context.rb +127 -0
- data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
- data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
- data/lib/crosslanguagespotter/model_loading.rb +333 -0
- data/lib/crosslanguagespotter/oracle.rb +261 -0
- data/lib/crosslanguagespotter/report.rb +88 -0
- data/lib/crosslanguagespotter/version.rb +5 -0
- data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
- data/lib/crosslanguagespotter.rb +7 -0
- data/lib/jars/weka.jar +0 -0
- data/resources/css/bootstrap-theme.css +346 -0
- data/resources/css/bootstrap-theme.min.css +7 -0
- data/resources/css/bootstrap.css +5780 -0
- data/resources/css/bootstrap.min.css +7 -0
- data/resources/css/highlightstyles/arta.css +160 -0
- data/resources/css/highlightstyles/ascetic.css +50 -0
- data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
- data/resources/css/highlightstyles/brown_paper.css +105 -0
- data/resources/css/highlightstyles/brown_papersq.png +0 -0
- data/resources/css/highlightstyles/dark.css +105 -0
- data/resources/css/highlightstyles/default.css +153 -0
- data/resources/css/highlightstyles/docco.css +132 -0
- data/resources/css/highlightstyles/far.css +113 -0
- data/resources/css/highlightstyles/foundation.css +133 -0
- data/resources/css/highlightstyles/github.css +125 -0
- data/resources/css/highlightstyles/googlecode.css +147 -0
- data/resources/css/highlightstyles/idea.css +122 -0
- data/resources/css/highlightstyles/ir_black.css +105 -0
- data/resources/css/highlightstyles/magula.css +123 -0
- data/resources/css/highlightstyles/mono-blue.css +62 -0
- data/resources/css/highlightstyles/monokai.css +127 -0
- data/resources/css/highlightstyles/monokai_sublime.css +149 -0
- data/resources/css/highlightstyles/obsidian.css +154 -0
- data/resources/css/highlightstyles/paraiso.dark.css +93 -0
- data/resources/css/highlightstyles/paraiso.light.css +93 -0
- data/resources/css/highlightstyles/pojoaque.css +106 -0
- data/resources/css/highlightstyles/pojoaque.jpg +0 -0
- data/resources/css/highlightstyles/railscasts.css +182 -0
- data/resources/css/highlightstyles/rainbow.css +112 -0
- data/resources/css/highlightstyles/school_book.css +113 -0
- data/resources/css/highlightstyles/school_book.png +0 -0
- data/resources/css/highlightstyles/solarized_dark.css +107 -0
- data/resources/css/highlightstyles/solarized_light.css +107 -0
- data/resources/css/highlightstyles/sunburst.css +160 -0
- data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
- data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night.css +93 -0
- data/resources/css/highlightstyles/tomorrow.css +90 -0
- data/resources/css/highlightstyles/vs.css +89 -0
- data/resources/css/highlightstyles/xcode.css +158 -0
- data/resources/css/highlightstyles/zenburn.css +117 -0
- data/resources/example.html +1501 -0
- data/resources/js/bootstrap.js +1943 -0
- data/resources/js/bootstrap.min.js +7 -0
- data/resources/js/highlight.pack.js +1 -0
- data/resources/services_example.html +141 -0
- data/resources/template.html +61 -0
- data/test/data/angular-puzzle.GS +111 -0
- data/test/data/angular_puzzle/app.js +66 -0
- data/test/data/angular_puzzle/index.html +67 -0
- data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
- data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
- data/test/data/example.html +5 -0
- data/test/data/example.js +4 -0
- data/test/data/services/index.html +33 -0
- data/test/data/services/script.js +15 -0
- data/test/test_helper.rb +9 -0
- data/test/test_parsing.rb +23 -0
- data/test/test_spotter.rb +42 -0
- data/test/test_wekaintegration.rb +43 -0
- metadata +328 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 68ee2d427d4dacc3d22b88a1deababeb2a1bcafd
|
4
|
+
data.tar.gz: 65ab29b3ba450f824a4930bd0a436170efabff57
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ec2f763115ece653ad3a1379f24397bd73e18f3d4d101b222a922fead038bb4498459319595ac86444e253176b66ca9ebe93f117632293f879ee1bbf2f348cb8
|
7
|
+
data.tar.gz: d7b1571637ec59db6e9519e035c4826fecde7d121b2e8f03898317f4eaf3cc237bb9253833f4f90357fa604fb1c0b31cfb9b1b27b496218b3066aa3034d4fae8
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'crosslanguagespotter/version'
|
5
|
+
|
6
|
+
# Gem specification for crosslanguagespotter. The platform is set to 'java',
# and the packaged file list is taken from `git ls-files`, so the gem has to
# be built from inside a git checkout.
Gem::Specification.new do |s|
  s.platform    = 'java'
  s.name        = 'crosslanguagespotter'
  s.version     = CrossLanguageSpotter::VERSION
  s.summary     = "Automatic Spotter of Cross-Language references"
  s.description = "Automatic Spotter of Cross-Language references"
  s.authors     = ["Federico Tomassetti"]
  s.email       = 'f.tomassetti@gmail.com'
  s.homepage    = 'https://github.com/CrossLanguageProject/crosslanguagerelationsspotter'
  # SPDX identifier: RubyGems warns about free-form license strings such as
  # "Apache v2" because tools cannot match them against known licenses.
  s.license     = "Apache-2.0"

  s.files         = `git ls-files`.split($/)
  # Everything under bin/ is exposed as an executable (by base name).
  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  # Runtime dependencies.
  s.add_dependency('codemodels')
  s.add_dependency('codemodels-js')
  s.add_dependency('codemodels-html')
  s.add_dependency('codemodels-java')
  s.add_dependency('codemodels-ruby')
  s.add_dependency('codemodels-xml')
  s.add_dependency('codemodels-properties')
  s.add_dependency('htmlentities')
  s.add_dependency('liquid')

  # Development-only dependencies.
  s.add_development_dependency "bundler"
  s.add_development_dependency "rake"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "rubygems-tasks"
end
|
data/examples/ex1.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Example script: builds a classifier with OracleLoader#build_weka_classifier
# from the angular_puzzle test project and its .GS file, classifies the
# relations of that same project and writes the outcome to
# resources/example.html. Run it from the gem root so relative paths resolve.
$LOAD_PATH << './lib'
require 'crosslanguagespotter'
include CrossLanguageSpotter

classifier = OracleLoader.new.build_weka_classifier(
  './test/data/angular_puzzle',
  './test/data/angular-puzzle.GS'
)

path    = './test/data/angular_puzzle'
spotter = CrossLanguageSpotter::Spotter.new
project = Project.new(path)

relations = spotter.classify_relations(project, classifier)
generate_report_file(relations, 'resources/example.html')
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Example script: builds a classifier with OracleLoader#build_weka_classifier
# from the angular_puzzle test project and its .GS file, then uses it to
# classify the relations of the separate "services" test project and writes
# the outcome to resources/services_example.html. Run it from the gem root so
# relative paths resolve.
$LOAD_PATH << './lib'
require 'crosslanguagespotter'
include CrossLanguageSpotter

classifier = OracleLoader.new.build_weka_classifier(
  './test/data/angular_puzzle',
  './test/data/angular-puzzle.GS'
)

path    = './test/data/services'
spotter = CrossLanguageSpotter::Spotter.new
project = Project.new(path)

relations = spotter.classify_relations(project, classifier)
generate_report_file(relations, 'resources/services_example.html')
|
@@ -0,0 +1,157 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "codemodels"
|
4
|
+
require "codemodels/html"
|
5
|
+
require "codemodels/js"
|
6
|
+
|
7
|
+
require 'crosslanguagespotter/figures_evaluator'
|
8
|
+
require 'crosslanguagespotter/methods/context'
|
9
|
+
require 'crosslanguagespotter/methods/tversky'
|
10
|
+
require 'crosslanguagespotter/methods/jaro'
|
11
|
+
require 'crosslanguagespotter/model_loading'
|
12
|
+
require 'csv'
|
13
|
+
require 'set'
|
14
|
+
require 'crosslanguagespotter/jaccard'
|
15
|
+
|
16
|
+
module CrossLanguageSpotter
|
17
|
+
|
18
|
+
# Recursively walks +dir+ and parses every regular file with
# CodeModels.parse_file.
#
# @param dir [String] directory to scan
# @param base_path [String] path of +dir+ relative to the directory the scan
#   started from ('' for the starting directory itself)
# @param models [Hash] accumulator, filled in place
# @return [Hash] +models+, mapping "<base_path>/<file name>" to the parsed model
#
# Files that cannot be parsed are reported on stdout and skipped.
def self._load_models(dir,base_path='',models={})
  Dir.foreach(dir) do |f|
    next if f == '.' || f == '..'

    path = dir+'/'+f
    if File.directory?(path)
      # Extend the relative path with the child directory name only. Appending
      # the whole +dir+ (as was done before) dropped the child name from the
      # key and repeated the root path at every further nesting level.
      _load_models(path,base_path+'/'+f,models)
    else
      begin
        models[base_path+'/'+f] = CodeModels.parse_file(path)
      # NOTE(review): Exception (not StandardError) is rescued on purpose here;
      # presumably so that parser failures surfacing as Java exceptions on
      # JRuby are skipped too - confirm before narrowing it.
      rescue Exception => e
        puts "No model available for #{path}: #{e}"
      end
    end
  end
  models
end
|
35
|
+
|
36
|
+
# Computes similarity features for candidate pairs of nodes and uses a
# classifier to decide which pairs are actual cross-language relations.
class Spotter

  def initialize
    @verbose = false
  end

  # Loads the models found under +dir+ and hands them to +_calc+.
  # NOTE(review): +_calc+ is not defined in the code visible here - confirm it
  # exists elsewhere before relying on this method.
  def find_relations(dir)
    models = CrossLanguageSpotter._load_models(dir)
    _calc(dir,models)
  end

  # Builds a Project for +dir+ and returns its feature rows
  # (see #features_for_project).
  def features_for_dir(dir)
    project = Project.new(dir,@verbose)
    features_for_project(project)
  end

  # Classifies every candidate pair of +project+ with +classifier+.
  #
  # @param project the project to analyse (passed to #features_for_project)
  # @param classifier object responding to +classify(instances)+ and yielding
  #   one entry per instance, each answering to +[:result]+
  # @return [Array<Hash>] the feature rows classified as relations, each with
  #   +:result+ set to true
  def classify_relations(project,classifier)
    rows = features_for_project(project).values
    # Every candidate starts as "not a relation"; the classifier's verdict is
    # written back below.
    rows.each { |row| row[:result] = false }

    keys = {shared_length: :numeric,
            tfidf_shared: :numeric, itfidf_shared: :numeric,
            perc_shared_length_min: :numeric,
            perc_shared_length_max: :numeric,
            diff_min: :numeric, diff_max: :numeric,
            perc_diff_min: :numeric, perc_diff_max: :numeric,
            context: :numeric, jaccard: :numeric, jaro: :numeric, tversky: :numeric,
            result: :boolean}
    data_instances = hash2weka_instances("data",rows.dup,keys,:result)
    classification = classifier.classify(data_instances)

    # The i-th classification refers to the i-th feature row.
    i = 0
    results = []
    classification.each do |c|
      if c[:result]
        rows[i][:result] = true
        results.push(rows[i])
      end
      i += 1
    end
    results
  end

  # Computes one feature row for each pair of nodes yielded by
  # +project.iter_over_shared_ids_instances+.
  #
  # @return [Hash] CrossLanguageRelation => Hash holding the position of both
  #   nodes (file, begin/end line and column) and the numeric features
  def features_for_project(project)
    results = {}
    tversky_producer = TverskyReferencesProducer.new({:alpha => 0.5, :threshold => 0.0})
    context_producer = ContextReferencesProducer.new({:alpha => 1.0, :threshold => 0.0})
    context_points_map = context_producer.points_map(project)
    jaro_producer = JaroReferencesProducer.new({:winkleradjust=>false,:threshold=>0.0})

    project.iter_over_shared_ids_instances do |ni,nj|
      # Only the context values that are shared ids of the project matter.
      context_ni = context(ni).values & project.shared_ids
      context_nj = context(nj).values & project.shared_ids
      shared_values = context_ni & context_nj
      shared_length = shared_values.count

      file_i = ni.source.artifact(:absolute).filename
      # Must come from nj: reading it from ni again made both tf-idf terms
      # below refer to the file of the first node.
      file_j = nj.source.artifact(:absolute).filename
      tfidf_shared = 0
      itfidf_shared = 0
      shared_values.each do |v|
        tfidf_shared += project.tf_idf(file_i,v)+project.tf_idf(file_j,v)
        itfidf_shared += project.itf_idf(file_i,v)+project.itf_idf(file_j,v)
      end

      perc_shared_length = [share_of(shared_length,context_ni.count),
                            share_of(shared_length,context_nj.count)]
      # Values found in the context of one node only.
      diffs = [context_ni.count-shared_length,context_nj.count-shared_length]
      perc_diffs = [share_of(diffs[0],context_ni.count),
                    share_of(diffs[1],context_nj.count)]

      id_i = NodeId.from_node(ni)
      id_j = NodeId.from_node(nj)
      rel = CrossLanguageRelation.new([id_i,id_j])

      # The coefficients may come back as NaN; they are stored as 0.0 then.
      jaccard = Jaccard.coefficient(context_ni,context_nj)
      jaccard = 0.0 if jaccard.nan?
      tversky = tversky_producer.tversky_coefficient(context_ni,context_nj)
      tversky = 0.0 if tversky.nan?
      context_points = context_points_map.points(Pair.new(ni,nj))
      jaro = jaro_producer.jaro_coefficient_from_nodes(ni,nj)
      jaro = 0.0 if jaro.nan?

      position_i = ni.source.position(:absolute)
      position_j = nj.source.position(:absolute)

      results[rel] = {
        node_a_file: file_i,
        node_a_begin_line: position_i.begin_line,
        node_a_end_line: position_i.end_line,
        node_a_begin_column: position_i.begin_column,
        node_a_end_column: position_i.end_column,

        node_b_file: file_j,
        node_b_begin_line: position_j.begin_line,
        node_b_end_line: position_j.end_line,
        node_b_begin_column: position_j.begin_column,
        node_b_end_column: position_j.end_column,

        shared_length: shared_length,
        tfidf_shared: tfidf_shared, itfidf_shared: itfidf_shared,
        perc_shared_length_min: perc_shared_length.min,
        perc_shared_length_max: perc_shared_length.max,
        diff_min: diffs.min, diff_max: diffs.max,
        perc_diff_min: perc_diffs.min, perc_diff_max: perc_diffs.max,
        context: context_points, jaccard: jaccard, jaro: jaro, tversky: tversky}
    end
    results
  end

  private

  # Fraction +part+/+total+ as a Float; 0.0 when +total+ is zero.
  def share_of(part,total)
    total == 0 ? 0.0 : part.to_f/total.to_f
  end

end
|
156
|
+
|
157
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'crosslanguagespotter/model_loading'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module CrossLanguageSpotter
|
5
|
+
|
6
|
+
# Collects the non-nil attribute values of the receiver and maps each value
# to the list of its declarators.
#
# The attributes are read from +self+ (through +self.class.ecore+), and
# +node+ is recorded as the declarator of every value found.
# NOTE(review): the previous body referenced an undefined local (+values+)
# and therefore always raised NameError; recording +node+ as the declarator
# is the assumed intent - confirm against the callers.
#
# @param node the object to record as declarator of each value
# @return [Hash] value => Array of declarators (no duplicates)
def collect_values_with_declarator(node)
  declarators_per_value = Hash.new {|h,k| h[k]=[]}
  self.class.ecore.eAllAttributes.each do |a|
    v = self.send(:"#{a.name}")
    next if v.nil?

    # Multi-valued attributes contribute each of their elements.
    found = a.many ? v : [v]
    found.each do |el|
      declarators = declarators_per_value[el]
      declarators << node unless declarators.include?(node)
    end
  end
  declarators_per_value
end
|
20
|
+
|
21
|
+
# The context of a node: a collection of values, each one associated with
# the declarators (the objects it was registered with).
class Context

  # Every registered value, in registration order, repetitions included.
  attr_reader :sequence_of_values

  def initialize
    # value => declarators (without duplicates)
    @map = Hash.new {|h,k| h[k]=[]}
    @sequence_of_values = []
    # Full log of the register calls, replayed by #clone.
    @register_sequence = []
  end

  # @return [Array] the values having at least one declarator
  def values
    @map.keys.select {|k| @map[k].count>0}
  end

  # @return [Integer] number of values having at least one declarator
  def count
    values.count
  end

  def has_value?(v)
    values.include?(v)
  end

  # @return [Array] declarators of +value+ (empty if it was never registered)
  def declarators_per_value(value)
    @map[value]
  end

  # Records that +value+ was declared by +declarator+. Registering the same
  # pair twice keeps one declarator entry but is still logged in
  # #sequence_of_values.
  def register(value,declarator)
    @sequence_of_values << value
    @map[value] << declarator unless @map[value].include?(declarator)
    # The declarator itself has to be logged (the value used to be stored in
    # its place), otherwise #clone cannot restore the declarators.
    @register_sequence << {value:value, declarator:declarator}
  end

  # Registers in this context every (value, declarator) pair of +other+.
  def merge(other)
    other.values.each do |v|
      other.declarators_per_value(v).each do |d|
        register(v,d)
      end
    end
  end

  # @return [Context] a new context built by replaying every register call
  #   received by this one
  def clone
    new_instance = Context.new
    @register_sequence.each do |r|
      new_instance.register(r[:value],r[:declarator])
    end
    new_instance
  end

  # Non-destructive version of #intersection!.
  #
  # @return [Context] a copy restricted to +values+; the receiver is untouched
  def intersection(values)
    new_instance = self.clone
    new_instance.intersection!(values)
    new_instance
  end

  # Drops the declarators of every value not contained in +values+.
  #
  # @param values [Array, Context] the values to keep; when it is a Context,
  #   its declarators for the kept values are added to the ones already here
  # @return [Context] self
  # @raise [RuntimeError] if this context holds values and +values+ is
  #   neither an Array nor a Context
  def intersection!(values)
    @map.keys.each do |k|
      if values.is_a? Array
        @map[k] = [] unless values.include?(k)
      elsif values.is_a? Context
        if values.has_value?(k)
          # Union rather than concat, so that a declarator known to both
          # contexts is not listed twice (same invariant as #register).
          @map[k] |= values.declarators_per_value(k)
        else
          @map[k] = []
        end
      else
        raise "error"
      end
    end
    self
  end

  # @return [Array<Hash>] one {value:, declarators:} entry per value, sorted
  #   by value
  def to_a
    values.sort.map do |v|
      {value:v,declarators:declarators_per_value(v)}
    end
  end

  def to_s
    to_a.to_s
  end

end
|
113
|
+
|
114
|
+
# Builds the Context of +node+.
#
# The result holds, in this order: the context of the container of +node+
# (computed recursively), the attribute values of the container itself, and
# the values of every node found by traversing those children of the
# container which either sit in a different containing feature than +node+
# or, when +node+ has no containing feature, are not +node+ itself.
#
# @return [Context] an empty context when +node+ has no container
def context(node)
  ctx = Context.new
  container = node.container_also_foreign
  return ctx unless container

  # Registers all the values of +owner+, with +owner+ as their declarator.
  register_values_of = lambda do |owner|
    owner.collect_values_with_count.keys.each { |value| ctx.register(value,owner) }
  end

  ctx.merge(context(container))

  # RGen attributes of the father
  register_values_of.call(container)

  # siblings in different containment reference
  container.all_children_also_foreign.each do |sibling|
    next unless (sibling.eContainingFeature!=node.eContainingFeature) ||
                (node.eContainingFeature==nil && node!=sibling)

    sibling.traverse(:also_foreign) { |n| register_values_of.call(n) }
  end

  ctx
end
|
138
|
+
|
139
|
+
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
module CrossLanguageSpotter
|
2
|
+
|
3
|
+
# Interface of the objects able to produce cross-language relations.
# Both methods are intentionally empty here.
class CrossLanguageReferencesProducer

  # @param parameters configuration of the producer (ignored here)
  def initialize(parameters); end

  # It should produce a set of CrossLanguageRelation.
  # This empty implementation returns nil.
  def produce_set(project); end

end
|
13
|
+
|
14
|
+
# Compares different methods (producer classes) against one gold set.
# Each method can be added several times, using different parameters.
class CrossLanguageReferencesProducerMethodsComparator
  # Figures obtained against the given gold set, stored per producer class
  # and, within a class, per parameters: results[clazz][parameters]
  attr_reader :results

  def initialize(gold_set,project)
    @gold_set = gold_set
    @project  = project
    @results  = Hash.new { |by_class,clazz| by_class[clazz] = {} }
  end

  # Instantiates +clazz+ with +parameters+, lets it produce its set for the
  # project and evaluates that set against the gold set.
  #
  # @return the figures (FiguresEvaluator#all_figures), also stored in #results
  def add(clazz,parameters)
    observed_set = clazz.new(parameters).produce_set(@project)
    @results[clazz][parameters] = FiguresEvaluator.new(@gold_set,observed_set).all_figures
  end

end
|
36
|
+
|
37
|
+
# Identifies a node through the name of a file and an index within it.
# Instances are usable as hash keys (eql?/hash) and can be ordered (<=>).
class NodeId
  attr_reader :file, :node_index

  # Builds the id of +node+: the file name of the final host of its source
  # artifact, plus the value +traverse_index+ gives for the node.
  def self.from_node(node)
    host = node.source.artifact.final_host
    new(host.filename,traverse_index(node))
  end

  def initialize(file,node_index)
    @file = file
    @node_index = node_index
  end

  # Same as #node_index.
  def index
    node_index
  end

  # Two ids are equal when both the file and the index are.
  def eql?(other)
    other.is_a?(NodeId) && file.eql?(other.file) && node_index.eql?(other.node_index)
  end

  def ==(other)
    eql?(other)
  end

  def hash
    @file.hash*7+@node_index.hash
  end

  # Orders by file first, by index for ids of the same file.
  def <=>(other)
    by_file = file <=> other.file
    by_file==0 ? (node_index <=> other.node_index) : by_file
  end

  def to_s
    "#{file}:#{node_index}"
  end

end
|
81
|
+
|
82
|
+
# An unordered pair of node ids: the two ids are stored in a canonical
# order, so that relations built from [a,b] and from [b,a] are equal.
class CrossLanguageRelation
  attr_reader :node_ids

  # @param node_ids the two ids, in any order
  # @raise [RuntimeError] unless exactly two elements are given
  def initialize(node_ids)
    raise "Two elements expected, #{node_ids.count} found" unless node_ids.count==2
    first  = node_ids[0]
    second = node_ids[1]
    # The smaller id goes first; when the two compare as equal they are swapped.
    @node_ids = (first<=>second)<0 ? [first,second] : [second,first]
  end

  def eql?(other)
    other.is_a?(CrossLanguageRelation) ? node_ids.eql?(other.node_ids) : false
  end

  def ==(other)
    eql?(other)
  end

  def hash
    @node_ids[0].hash*7+@node_ids[1].hash
  end

  def to_s
    lower, upper = @node_ids[0], @node_ids[1]
    "CrossLanguageRelation #{lower} <-> #{upper}"
  end

end
|
116
|
+
|
117
|
+
# Calculates precision, recall and F-measure of an observed set against a
# gold set. Both sets must respond to +intersection+ and +count+.
class FiguresEvaluator

  # Gold set is the "truth", observed is calculated from
  # some method and compared with the gold set
  def initialize(gold_set,observed_set)
    @gold_set = gold_set
    @observed_set = observed_set
  end

  # @return [Float] fraction of the observed elements that are in the gold
  #   set; 0.0 when the observed set is empty
  def precision
    @precision ||= calc_precision
  end

  # @return [Float] fraction of the gold elements that were observed;
  #   0.0 when the gold set is empty
  def recall
    @recall ||= calc_recall
  end

  # F-measure: (1 + beta^2) * P * R / (beta^2 * P + R).
  #
  # @param beta [Numeric] weight of recall relative to precision
  #   (1.0 gives the harmonic mean of the two)
  # @return [Float] 0.0 when both precision and recall are zero
  def f_measure(beta=1.0)
    beta_square = beta**2.0
    denominator = beta_square*precision+recall
    return 0.0 if denominator==0

    # The factor is (1 + beta^2): using 2*beta^2 is only right for beta == 1.
    ((1.0+beta_square)*precision*recall)/denominator
  end

  # @return [Hash] the keys :precision, :recall, :f_measure and :beta
  def all_figures(beta=1.0)
    {precision:precision,recall:recall,f_measure:f_measure(beta),beta:beta}
  end

  private

  # Number of elements found in both sets (computed once).
  def hits
    @hits ||= @gold_set.intersection(@observed_set).count.to_f
  end

  def calc_precision
    ratio_of_hits(@observed_set.count)
  end

  def calc_recall
    ratio_of_hits(@gold_set.count)
  end

  # hits/total, with 0.0 in place of the NaN a zero +total+ would give.
  def ratio_of_hits(total)
    total==0 ? 0.0 : hits/total.to_f
  end

end
|
159
|
+
|
160
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require "set"
|
2
|
+
|
3
|
+
# Helpers around the Jaccard index and the metrics derived from it.
#
# The Jaccard index measures how similar two sample sets are: it is the size
# of their intersection divided by the size of their union, so the closer it
# is to 1.0 the more similar the two sets are.
module Jaccard
  # Jaccard index of +a+ and +b+.
  #
  # +a+ has to respond to <code>#&</code> (intersection) and <code>#+</code>
  # (union), as Array and Set do. When the result of <code>+</code> responds
  # to +#uniq!+ it is de-duplicated before being measured; a union still
  # holding duplicates gives a wrong result. The items must implement a sane
  # +#eql?+.
  #
  # @param [#&, #+] a A set of items
  # @param [#&, #+] b A second set of items
  #
  # @return [Float] the index of +a+ and +b+ (NaN for two empty Arrays)
  #
  # @raise [ArgumentError] if +a+ lacks one of the two operators
  #
  # @example
  #
  #   Jaccard.coefficient([1, 2, 3, 4], [1, 3, 4]) #=> 0.75
  #
  # @see http://en.wikipedia.org/wiki/Jaccard_index Jaccard index on Wikipedia.
  def self.coefficient(a, b)
    [:&, :+].each do |operator|
      next if a.respond_to?(operator)
      raise ArgumentError, "#{a.inspect} does not implement ##{operator}"
    end

    common   = a & b
    combined = a + b

    # Array#+ keeps the duplicates, hence the explicit clean-up; a Set has
    # no #uniq! because its elements are unique by construction.
    combined.uniq! if combined.respond_to?(:uniq!)

    common.length.to_f / combined.length.to_f
  end

  # Jaccard distance: the complement of the index, so the closer it is to
  # 0.0 the more similar the two items are.
  #
  # @return [Float] <code>1.0 - #coefficient(a, b)</code>
  #
  # @see Jaccard#coefficient for the requirements on +a+ and +b+.
  def self.distance(a, b)
    similarity = coefficient(a, b)
    1.0 - similarity
  end

  # Finds the member of +others+ with the smallest distance from +a+.
  #
  # On ties the LAST of the tied members wins; reverse +others+ before the
  # call if the first one is wanted instead.
  #
  # @param [#&, #+] a A set of attributes
  # @param [#each] others A collection of sets of attributes
  #
  # @return the closest member of +others+, nil when +others+ is empty
  #
  # @example
  #
  #   a = [1, 2, 3]
  #   b = [1, 3]
  #   c = [1, 2, 3]
  #   Jaccard.closest_to(b, [a, c]) #=> [1, 2, 3]
  #   # the instance returned is c
  def self.closest_to(a, others)
    # A distance never exceeds 1.0, so 2.0 loses against any candidate.
    best_distance = 2.0
    best = nil

    others.each do |candidate|
      candidate_distance = distance(a, candidate)
      # Strict comparison: an equally distant later candidate replaces the
      # current best one.
      next if best_distance < candidate_distance

      best_distance = candidate_distance
      best = candidate
    end

    best
  end

  # Finds the pair of items with the smallest distance from each other.
  #
  # @param [#each] items A collection of sets of attributes.
  #
  # @return [Array<a, b>] the pair having the minimal Jaccard distance
  #
  # @example
  #
  #   a = [1, 2, 3]
  #   b = [1, 2]
  #   c = [1, 3]
  #   Jaccard.best_match([a, b, c]) #=> [[1, 2, 3], [1, 2]]
  def self.best_match(items)
    visited = Set.new
    scored_pairs = []

    items.each do |left|
      items.each do |right|
        pair = [left, right]
        # Skip an item paired with itself (or an equal one), and pairs
        # already scored in either order.
        next if left == right
        next if visited.include?(pair) || visited.include?([right, left])

        visited << pair
        scored_pairs << [distance(left, right), pair]
      end
    end

    # Sorting puts the entry with the lowest distance first.
    closest = scored_pairs.sort.first
    closest.last
  end
end
|