crosslanguagespotter 0.0.2-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/Gemfile +3 -0
  4. data/Rakefile +13 -0
  5. data/crosslanguagespotter.gemspec +36 -0
  6. data/examples/ex1.rb +13 -0
  7. data/examples/services_example.rb +13 -0
  8. data/lib/crosslanguagespotter/basic.rb +157 -0
  9. data/lib/crosslanguagespotter/context.rb +139 -0
  10. data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
  11. data/lib/crosslanguagespotter/jaccard.rb +114 -0
  12. data/lib/crosslanguagespotter/methods/context.rb +127 -0
  13. data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
  14. data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
  15. data/lib/crosslanguagespotter/model_loading.rb +333 -0
  16. data/lib/crosslanguagespotter/oracle.rb +261 -0
  17. data/lib/crosslanguagespotter/report.rb +88 -0
  18. data/lib/crosslanguagespotter/version.rb +5 -0
  19. data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
  20. data/lib/crosslanguagespotter.rb +7 -0
  21. data/lib/jars/weka.jar +0 -0
  22. data/resources/css/bootstrap-theme.css +346 -0
  23. data/resources/css/bootstrap-theme.min.css +7 -0
  24. data/resources/css/bootstrap.css +5780 -0
  25. data/resources/css/bootstrap.min.css +7 -0
  26. data/resources/css/highlightstyles/arta.css +160 -0
  27. data/resources/css/highlightstyles/ascetic.css +50 -0
  28. data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
  29. data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
  30. data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
  31. data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
  32. data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
  33. data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
  34. data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
  35. data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
  36. data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
  37. data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
  38. data/resources/css/highlightstyles/brown_paper.css +105 -0
  39. data/resources/css/highlightstyles/brown_papersq.png +0 -0
  40. data/resources/css/highlightstyles/dark.css +105 -0
  41. data/resources/css/highlightstyles/default.css +153 -0
  42. data/resources/css/highlightstyles/docco.css +132 -0
  43. data/resources/css/highlightstyles/far.css +113 -0
  44. data/resources/css/highlightstyles/foundation.css +133 -0
  45. data/resources/css/highlightstyles/github.css +125 -0
  46. data/resources/css/highlightstyles/googlecode.css +147 -0
  47. data/resources/css/highlightstyles/idea.css +122 -0
  48. data/resources/css/highlightstyles/ir_black.css +105 -0
  49. data/resources/css/highlightstyles/magula.css +123 -0
  50. data/resources/css/highlightstyles/mono-blue.css +62 -0
  51. data/resources/css/highlightstyles/monokai.css +127 -0
  52. data/resources/css/highlightstyles/monokai_sublime.css +149 -0
  53. data/resources/css/highlightstyles/obsidian.css +154 -0
  54. data/resources/css/highlightstyles/paraiso.dark.css +93 -0
  55. data/resources/css/highlightstyles/paraiso.light.css +93 -0
  56. data/resources/css/highlightstyles/pojoaque.css +106 -0
  57. data/resources/css/highlightstyles/pojoaque.jpg +0 -0
  58. data/resources/css/highlightstyles/railscasts.css +182 -0
  59. data/resources/css/highlightstyles/rainbow.css +112 -0
  60. data/resources/css/highlightstyles/school_book.css +113 -0
  61. data/resources/css/highlightstyles/school_book.png +0 -0
  62. data/resources/css/highlightstyles/solarized_dark.css +107 -0
  63. data/resources/css/highlightstyles/solarized_light.css +107 -0
  64. data/resources/css/highlightstyles/sunburst.css +160 -0
  65. data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
  66. data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
  67. data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
  68. data/resources/css/highlightstyles/tomorrow-night.css +93 -0
  69. data/resources/css/highlightstyles/tomorrow.css +90 -0
  70. data/resources/css/highlightstyles/vs.css +89 -0
  71. data/resources/css/highlightstyles/xcode.css +158 -0
  72. data/resources/css/highlightstyles/zenburn.css +117 -0
  73. data/resources/example.html +1501 -0
  74. data/resources/js/bootstrap.js +1943 -0
  75. data/resources/js/bootstrap.min.js +7 -0
  76. data/resources/js/highlight.pack.js +1 -0
  77. data/resources/services_example.html +141 -0
  78. data/resources/template.html +61 -0
  79. data/test/data/angular-puzzle.GS +111 -0
  80. data/test/data/angular_puzzle/app.js +66 -0
  81. data/test/data/angular_puzzle/index.html +67 -0
  82. data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
  83. data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
  84. data/test/data/example.html +5 -0
  85. data/test/data/example.js +4 -0
  86. data/test/data/services/index.html +33 -0
  87. data/test/data/services/script.js +15 -0
  88. data/test/test_helper.rb +9 -0
  89. data/test/test_parsing.rb +23 -0
  90. data/test/test_spotter.rb +42 -0
  91. data/test/test_wekaintegration.rb +43 -0
  92. metadata +328 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 68ee2d427d4dacc3d22b88a1deababeb2a1bcafd
4
+ data.tar.gz: 65ab29b3ba450f824a4930bd0a436170efabff57
5
+ SHA512:
6
+ metadata.gz: ec2f763115ece653ad3a1379f24397bd73e18f3d4d101b222a922fead038bb4498459319595ac86444e253176b66ca9ebe93f117632293f879ee1bbf2f348cb8
7
+ data.tar.gz: d7b1571637ec59db6e9519e035c4826fecde7d121b2e8f03898317f4eaf3cc237bb9253833f4f90357fa604fb1c0b31cfb9b1b27b496218b3066aa3034d4fae8
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ Gemfile.lock
2
+ *.gem
3
+ *.rbc
4
+ .bundle
5
+ .config
6
+ coverage
7
+ InstalledFiles
8
+ lib/bundler/man
9
+ pkg
10
+ rdoc
11
+ spec/reports
12
+ test/tmp
13
+ test/version_tmp
14
+ tmp
15
+
16
+ # YARD artifacts
17
+ .yardoc
18
+ _yardoc
19
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Fetch gems over TLS; the plain-http endpoint can be tampered with in transit.
source "https://rubygems.org"

# All dependencies are declared in the gemspec.
gemspec
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require 'rake/testtask'
2
+ require 'rubygems/tasks'
3
+
4
# Test task with the test/ directory added to the load path.
Rake::TestTask.new { |test_task| test_task.libs.push('test') }

# Gem::Tasks (from rubygems/tasks); its console command is set to jruby.
Gem::Tasks.new { |gem_tasks| gem_tasks.console.command = 'jruby' }

desc "Run tests"
task default: :test
data/crosslanguagespotter.gemspec ADDED
@@ -0,0 +1,36 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'crosslanguagespotter/version'

# Gem specification for crosslanguagespotter; the platform is pinned to 'java'.
Gem::Specification.new do |s|
  s.platform    = 'java'
  s.name        = 'crosslanguagespotter'
  s.version     = CrossLanguageSpotter::VERSION
  s.summary     = "Automatic Spotter of Cross-Language references"
  s.description = "Automatic Spotter of Cross-Language references"
  s.authors     = ["Federico Tomassetti"]
  s.email       = 'f.tomassetti@gmail.com'
  s.homepage    = 'https://github.com/CrossLanguageProject/crosslanguagerelationsspotter'
  # SPDX identifier: RubyGems warns about free-form strings such as "Apache v2".
  s.license     = "Apache-2.0"

  # Package exactly the files tracked by git; executables and tests are
  # derived from that list by path prefix.
  s.files         = `git ls-files`.split($/)
  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  s.add_dependency('codemodels')
  s.add_dependency('codemodels-js')
  s.add_dependency('codemodels-html')
  s.add_dependency('codemodels-java')
  s.add_dependency('codemodels-ruby')
  s.add_dependency('codemodels-xml')
  s.add_dependency('codemodels-properties')
  s.add_dependency('htmlentities')
  s.add_dependency('liquid')

  s.add_development_dependency "bundler"
  s.add_development_dependency "rake"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "rubygems-tasks"
end
data/examples/ex1.rb ADDED
@@ -0,0 +1,13 @@
1
$: << './lib'
require 'crosslanguagespotter'
include CrossLanguageSpotter

# Build a classifier from the angular_puzzle sample project and its .GS file.
classifier = OracleLoader.new.build_weka_classifier('./test/data/angular_puzzle','./test/data/angular-puzzle.GS')

# Classify the relations of the same sample project...
project_dir = './test/data/angular_puzzle'
spotter = CrossLanguageSpotter::Spotter.new()
sample_project = Project.new(project_dir)
relations = spotter.classify_relations(sample_project,classifier)

# ...and write them out as a report file.
generate_report_file(relations,'resources/example.html')
data/examples/services_example.rb ADDED
@@ -0,0 +1,13 @@
1
$: << './lib'
require 'crosslanguagespotter'
include CrossLanguageSpotter

# Build a classifier from the angular_puzzle sample project and its .GS file.
classifier = OracleLoader.new.build_weka_classifier('./test/data/angular_puzzle','./test/data/angular-puzzle.GS')

# Apply that classifier to a different project: the services sample.
project_dir = './test/data/services'
spotter = CrossLanguageSpotter::Spotter.new()
services_project = Project.new(project_dir)
relations = spotter.classify_relations(services_project,classifier)

# Write the classified relations out as a report file.
generate_report_file(relations,'resources/services_example.html')
data/lib/crosslanguagespotter/basic.rb ADDED
@@ -0,0 +1,157 @@
1
+ # encoding: utf-8
2
+
3
+ require "codemodels"
4
+ require "codemodels/html"
5
+ require "codemodels/js"
6
+
7
+ require 'crosslanguagespotter/figures_evaluator'
8
+ require 'crosslanguagespotter/methods/context'
9
+ require 'crosslanguagespotter/methods/tversky'
10
+ require 'crosslanguagespotter/methods/jaro'
11
+ require 'crosslanguagespotter/model_loading'
12
+ require 'csv'
13
+ require 'set'
14
+ require 'crosslanguagespotter/jaccard'
15
+
16
+ module CrossLanguageSpotter
17
+
18
# Recursively parses every file below +dir+ with CodeModels.parse_file.
#
# Files that cannot be parsed are reported on stdout and skipped.
#
# @param dir [String] directory to scan
# @param base_path [String] key prefix for entries found directly in +dir+
#   ('' for the root directory, giving keys such as "/sub/file.js")
# @param models [Hash] accumulator; it is filled in place and returned
# @return [Hash] parsed models keyed by their path relative to the root directory
def self._load_models(dir,base_path='',models={})
  Dir.foreach(dir) do |f|
    next if f=='.' || f=='..'

    path = dir+'/'+f
    key = base_path+'/'+f
    if File.directory?(path)
      # Descend using the subdirectory's own name as prefix. The previous
      # prefix (base_path+'/'+dir) dropped the subdirectory name, so files
      # with the same name in sibling directories overwrote each other.
      _load_models(path,key,models)
    else
      begin
        models[key] = CodeModels.parse_file(path)
      rescue SystemExit, SignalException, NoMemoryError
        # Never swallow process-level conditions.
        raise
      rescue Exception => e
        # NOTE(review): the broad rescue is kept on purpose; presumably the
        # parsers can raise errors outside StandardError - confirm.
        puts "No model available for #{path}: #{e}"
      end
    end
  end
  models
end
35
+
36
# Computes similarity features for pairs of nodes that share identifiers
# and filters those pairs through a classifier.
class Spotter

  def initialize
    @verbose = false
  end

  # NOTE(review): this calls `_calc`, which is not defined in this class -
  # confirm it exists elsewhere before relying on this method.
  def find_relations(dir)
    models = CrossLanguageSpotter._load_models(dir)
    _calc(dir,models)
  end

  # Builds a Project for +dir+ and returns its features.
  #
  # @param dir [String] project directory
  # @return [Hash] see #features_for_project
  def features_for_dir(dir)
    features_for_project(Project.new(dir,@verbose))
  end

  # Returns the feature rows the classifier marks as real relations.
  #
  # Every row is first tagged with result: false, converted with
  # hash2weka_instances and classified; rows classified positively get
  # result: true and are returned.
  #
  # @param project the project to analyze
  # @param classifier object responding to #classify
  # @return [Array<Hash>] the feature rows classified as relations
  def classify_relations(project,classifier)
    original_rows = []
    data = []
    features_for_project(project).each do |_rel,row|
      row[:result] = false
      data.push(row)
      original_rows.push(row)
    end
    keys = {shared_length: :numeric,
            tfidf_shared: :numeric,itfidf_shared: :numeric,
            perc_shared_length_min: :numeric,
            perc_shared_length_max: :numeric,
            diff_min: :numeric,diff_max: :numeric,
            perc_diff_min: :numeric,perc_diff_max: :numeric,
            context: :numeric,jaccard: :numeric,jaro: :numeric,tversky: :numeric,
            result: :boolean}
    data_instances = hash2weka_instances("data",data,keys,:result)
    classification = classifier.classify(data_instances)

    # Classification entries are matched to the original rows by position.
    i = 0
    results = []
    classification.each do |c|
      if c[:result]
        original_rows[i][:result] = true
        results.push(original_rows[i])
      end
      i += 1
    end
    results
  end

  # Computes one feature row per pair of nodes yielded by
  # project.iter_over_shared_ids_instances.
  #
  # @param project the project to analyze
  # @return [Hash] feature rows keyed by CrossLanguageRelation; each row
  #   holds the position of both nodes and the numeric features
  def features_for_project(project)
    results = {}
    tversky_producer = TverskyReferencesProducer.new({:alpha => 0.5, :threshold => 0.0})
    context_producer = ContextReferencesProducer.new({:alpha => 1.0, :threshold => 0.0})
    context_points_map = context_producer.points_map(project)
    jaro_producer = JaroReferencesProducer.new({:winkleradjust=>false,:threshold=>0.0})

    project.iter_over_shared_ids_instances do |ni,nj|
      # Only context values that are shared ids of the project are considered.
      context_ni = context(ni).values & project.shared_ids
      context_nj = context(nj).values & project.shared_ids
      shared_values = context_ni & context_nj
      shared_length = shared_values.count

      file_i = ni.source.artifact(:absolute).filename
      # Fixed: this was read from ni, so the file of the second node never
      # contributed to the tf-idf sums.
      file_j = nj.source.artifact(:absolute).filename
      tfidf_shared = 0
      itfidf_shared = 0
      shared_values.each do |v|
        tfidf_shared += project.tf_idf(file_i,v)+project.tf_idf(file_j,v)
        itfidf_shared += project.itf_idf(file_i,v)+project.itf_idf(file_j,v)
      end

      # Share of each context covered by the common values (0.0 for an empty context).
      perc_shared_length = [context_ni,context_nj].map do |ctx_values|
        ctx_values.count==0 ? 0.0 : shared_length.to_f/ctx_values.count.to_f
      end

      # Values of each context that are not shared, absolute and relative.
      diffs = [context_ni.count-shared_length,context_nj.count-shared_length]
      perc_diff_i = context_ni.count==0 ? 0.0 : diffs[0].to_f/context_ni.count.to_f
      perc_diff_j = context_nj.count==0 ? 0.0 : diffs[1].to_f/context_nj.count.to_f
      perc_diffs = [perc_diff_i,perc_diff_j]

      rel = CrossLanguageRelation.new([NodeId.from_node(ni),NodeId.from_node(nj)])

      # NaN coefficients are normalized to 0.0.
      jaccard = Jaccard.coefficient(context_ni,context_nj)
      jaccard = 0.0 if jaccard.nan?
      tversky = tversky_producer.tversky_coefficient(context_ni,context_nj)
      tversky = 0.0 if tversky.nan?
      context_points = context_points_map.points(Pair.new(ni,nj))
      jaro = jaro_producer.jaro_coefficient_from_nodes(ni,nj)
      jaro = 0.0 if jaro.nan?

      position_i = ni.source.position(:absolute)
      position_j = nj.source.position(:absolute)

      results[rel] = {
        node_a_file: file_i,
        node_a_begin_line: position_i.begin_line,
        node_a_end_line: position_i.end_line,
        node_a_begin_column: position_i.begin_column,
        node_a_end_column: position_i.end_column,

        node_b_file: file_j,
        node_b_begin_line: position_j.begin_line,
        node_b_end_line: position_j.end_line,
        node_b_begin_column: position_j.begin_column,
        node_b_end_column: position_j.end_column,

        shared_length: shared_length,
        tfidf_shared: tfidf_shared,itfidf_shared: itfidf_shared,
        perc_shared_length_min: perc_shared_length.min,
        perc_shared_length_max: perc_shared_length.max,
        diff_min: diffs.min,diff_max: diffs.max,
        perc_diff_min: perc_diffs.min,perc_diff_max: perc_diffs.max,
        context: context_points,jaccard: jaccard,jaro: jaro,tversky: tversky}
    end
    results
  end

end
156
+
157
+ end
data/lib/crosslanguagespotter/context.rb ADDED
@@ -0,0 +1,139 @@
1
+ require 'crosslanguagespotter/model_loading'
2
+ require 'set'
3
+
4
+ module CrossLanguageSpotter
5
+
6
# Maps every non-nil attribute value of +node+ to the nodes declaring it.
#
# The previous body could never return: it wrote to an undefined `values`
# local and ignored both +node+ and the hash it had prepared.
# NOTE(review): reading the attributes from +node+ (rather than from self)
# is the assumed intent - confirm against callers.
#
# @param node object whose class exposes `ecore.eAllAttributes`
# @return [Hash] attribute value => Array of declarators (here always [node])
def collect_values_with_declarator(node)
  declarators_per_value = Hash.new {|h,k| h[k]=[]}
  node.class.ecore.eAllAttributes.each do |a|
    v = node.send(:"#{a.name}")
    next if v.nil?

    # Multi-valued attributes contribute each element separately.
    (a.many ? v : [v]).each do |el|
      declarators = declarators_per_value[el]
      declarators << node unless declarators.include?(node)
    end
  end
  declarators_per_value
end
20
+
21
# Collection of values, each associated with the declarators that
# registered it. The order of registrations is also recorded.
class Context

  # Values in registration order, duplicates included.
  attr_reader :sequence_of_values

  def initialize
    @map = Hash.new {|h,k| h[k]=[]}
    @sequence_of_values = []
    @register_sequence = []
  end

  # @return [Array] the values that currently have at least one declarator
  def values
    @map.reject {|_value,declarators| declarators.empty?}.keys
  end

  # @return [Integer] number of values with at least one declarator
  def count
    values.count
  end

  def has_value?(v)
    @map.key?(v) && !@map[v].empty?
  end

  # @return [Array] the declarators registered for +value+ (empty if none)
  def declarators_per_value(value)
    @map[value]
  end

  # Records that +declarator+ declares +value+. A declarator is stored at
  # most once per value, but every call is appended to the sequences.
  def register(value,declarator)
    @sequence_of_values << value
    @map[value] << declarator unless @map[value].include?(declarator)
    # Fixed: the log used to store the value as its own declarator, so
    # #clone (which replays this log) lost the real declarators.
    @register_sequence << {value:value, declarator:declarator}
  end

  # Registers every (value, declarator) pair of +other+ into this context.
  def merge(other)
    other.values.each do |v|
      other.declarators_per_value(v).each {|d| register(v,d)}
    end
  end

  # Builds a new Context by replaying the recorded #register calls.
  # NOTE(review): changes made through #intersection! are not part of that
  # record, so they are not carried over to the copy.
  def clone
    new_instance = Context.new
    @register_sequence.each do |r|
      new_instance.register(r[:value],r[:declarator])
    end
    new_instance
  end

  # Non-destructive variant of #intersection!.
  #
  # @return [Context] a copy restricted to +values+
  def intersection(values)
    self.clone.intersection!(values)
  end

  # Drops the declarators of every value not present in +values+.
  #
  # @param values [Array, Context] values to keep; for a Context, the
  #   declarators it holds for a kept value are appended to this one
  # @raise [RuntimeError] for any other argument type (only detected when
  #   this context has at least one key)
  # @return [Context] self
  def intersection!(values)
    @map.keys.each do |k|
      case values
      when Array
        @map[k] = [] unless values.include?(k)
      when Context
        if values.has_value?(k)
          @map[k].concat(values.declarators_per_value(k))
        else
          @map[k] = []
        end
      else
        raise "error"
      end
    end
    self
  end

  # @return [Array<Hash>] one {value:, declarators:} entry per value, sorted by value
  def to_a
    values.sort.map {|v| {value:v,declarators:declarators_per_value(v)}}
  end

  def to_s
    to_a.to_s
  end

end
113
+
114
# Builds the Context of +node+: the context of its container, the
# attribute values of the container itself, and the values found while
# traversing the container's other children (see the condition below).
# A node without container gets an empty Context.
def context(node)
  ctx = Context.new
  container = node.container_also_foreign
  return ctx unless container

  ctx.merge(context(container))

  # RGen attributes of the father
  container.collect_values_with_count.keys.each {|value| ctx.register(value,container)}

  # siblings in different containment reference
  container.all_children_also_foreign.each do |sibling|
    next unless (sibling.eContainingFeature!=node.eContainingFeature) || (node.eContainingFeature==nil && node!=sibling)

    sibling.traverse(:also_foreign) do |n|
      n.collect_values_with_count.keys.each {|value| ctx.register(value,n)}
    end
  end
  ctx
end
138
+
139
+ end
data/lib/crosslanguagespotter/figures_evaluator.rb ADDED
@@ -0,0 +1,160 @@
1
+ module CrossLanguageSpotter
2
+
3
# Placeholder producer: both methods have empty bodies here.
class CrossLanguageReferencesProducer

  # @param parameters producer settings (unused by this empty implementation)
  def initialize(parameters); end

  # It should produce a set of CrossLanguageRelation
  def produce_set(project); end

end
13
+
14
# Compares different producer methods; each method can be instantiated
# several times with different parameters.
class CrossLanguageReferencesProducerMethodsComparator
  # Figures obtained against the given gold set, keyed by producer class
  # and then by parameters.
  attr_reader :results

  def initialize(gold_set,project)
    @project = project
    @gold_set = gold_set
    @results = Hash.new {|h,k| h[k]={}}
  end

  # Instantiates +clazz+ with +parameters+, evaluates the set it produces
  # against the gold set, stores the figures and returns them.
  def add(clazz,parameters)
    observed_set = clazz.new(parameters).produce_set(@project)
    figures = FiguresEvaluator.new(@gold_set,observed_set).all_figures
    @results[clazz][parameters] = figures
    figures
  end

end
36
+
37
# Identifies a node by the file it belongs to and its index within it.
# Instances compare by value and can be used as Hash keys.
class NodeId
  attr_reader :file
  attr_reader :node_index

  # Alias of #node_index.
  def index
    @node_index
  end

  # Builds the id of +node+ from the filename of its final host artifact
  # and the result of `traverse_index(node)` (defined outside this class).
  def self.from_node(node)
    new(node.source.artifact.final_host.filename,traverse_index(node))
  end

  def initialize(file,node_index)
    @file = file
    @node_index = node_index
  end

  def eql?(other)
    return false unless other.is_a?(NodeId)
    self.file.eql?(other.file) && self.node_index.eql?(other.node_index)
  end

  def ==(other)
    self.eql?(other)
  end

  def hash
    @file.hash*7+@node_index.hash
  end

  # Orders by file, then by node index.
  #
  # @return [Integer, nil] -1, 0 or 1; nil when +other+ is not a NodeId
  #   (the Ruby convention for incomparable operands) instead of raising
  #   NoMethodError as before
  def <=>(other)
    return nil unless other.is_a?(NodeId)

    res = self.file <=> other.file
    res==0 ? self.node_index <=> other.node_index : res
  end

  def to_s
    "#{@file}:#{@node_index}"
  end

end
81
+
82
# Unordered pair of node ids: the two ids are stored in a canonical order
# so that relations built from [a,b] and [b,a] compare equal.
class CrossLanguageRelation
  attr_reader :node_ids

  # @param node_ids collection of exactly two ids supporting <=>
  # @raise [RuntimeError] when the collection does not hold two elements
  def initialize(node_ids)
    raise "Two elements expected, #{node_ids.count} found" unless node_ids.count==2
    first_id, second_id = node_ids[0], node_ids[1]
    in_order = (first_id<=>second_id)<0
    @node_ids = in_order ? [first_id,second_id] : [second_id,first_id]
  end

  def eql?(other)
    other.is_a?(CrossLanguageRelation) && self.node_ids.eql?(other.node_ids)
  end

  def ==(other)
    self.eql?(other)
  end

  def hash
    lower, upper = @node_ids
    lower.hash*7+upper.hash
  end

  def to_s
    "CrossLanguageRelation #{@node_ids[0]} <-> #{@node_ids[1]}"
  end

end
116
+
117
# Calculates precision, recall and F-measure of an observed set against
# a gold set.
class FiguresEvaluator

  # Gold set is the "truth", observed is calculated from
  # some method and compared with the gold set
  def initialize(gold_set,observed_set)
    @gold_set = gold_set
    @observed_set = observed_set
  end

  # @return [Float] |gold & observed| / |observed| (NaN for an empty observed set)
  def precision
    @precision ||= calc_precision
  end

  # @return [Float] |gold & observed| / |gold| (NaN for an empty gold set)
  def recall
    @recall ||= calc_recall
  end

  # F-beta score: (1 + beta^2) * P * R / (beta^2 * P + R).
  #
  # The numerator used to be 2 * beta^2 * P * R, which matches the
  # definition only for beta == 1.
  #
  # @param beta [Numeric] weight of recall relative to precision
  # @return [Float] the score; 0.0 when precision and recall are both zero
  def f_measure(beta=1.0)
    beta_square = beta**2.0
    denominator = beta_square*precision+recall
    return 0.0 if denominator.zero?

    ((1.0+beta_square)*precision*recall)/denominator
  end

  # @return [Hash] precision, recall, f_measure and the beta used
  def all_figures(beta=1.0)
    {precision:precision,recall:recall,f_measure:f_measure(beta),beta:beta}
  end

  private

  def calc_precision
    intersection_size = @gold_set.intersection(@observed_set).count.to_f
    intersection_size/@observed_set.count.to_f
  end

  def calc_recall
    intersection_size = @gold_set.intersection(@observed_set).count.to_f
    intersection_size/@gold_set.count.to_f
  end

end
159
+
160
+ end
data/lib/crosslanguagespotter/jaccard.rb ADDED
@@ -0,0 +1,114 @@
1
+ require "set"
2
+
3
# Helpers to calculate the Jaccard Coefficient Index and related metrics easily.
#
# (from Wikipedia): The Jaccard coefficient measures similarity between sample sets, and is defined
# as the size of the intersection divided by the size of the union of the sample sets.
#
# The closer to 1.0 this number is, the more similar two items are.
module Jaccard
  # Calculates the Jaccard Coefficient Index.
  #
  # +a+ must implement the set intersection and set union operators: <code>#&</code> and <code>#+</code>. Array and Set
  # both implement these methods natively. It is expected that the result of <code>+</code> is either a
  # unique set or an object that responds to +#uniq!+. The result of +#coefficient+ will be
  # wrong if the union contains duplicate elements.
  #
  # Also note that the individual items in +a+ and +b+ must implement a sane #eql? method.
  #
  # @param [#&, #+] a A set of items
  # @param [#&, #+] b A second set of items
  #
  # @return [Float] The Jaccard Coefficient Index between +a+ and +b+
  #   (NaN when the union is empty).
  #
  # @raise [ArgumentError] when +a+ does not respond to #& or #+
  #
  # @example
  #
  #   a = [1, 2, 3, 4]
  #   b = [1, 3, 4]
  #   Jaccard.coefficient(a, b) #=> 0.75
  #
  # @see http://en.wikipedia.org/wiki/Jaccard_index Jaccard Coefficient Index on Wikipedia.
  def self.coefficient(a, b)
    raise ArgumentError, "#{a.inspect} does not implement #&" unless a.respond_to?(:&)
    raise ArgumentError, "#{a.inspect} does not implement #+" unless a.respond_to?(:+)

    intersection = a & b
    union = a + b

    # Set does not implement #uniq!, since its elements are already unique;
    # Array concatenation keeps duplicates, so they are removed here.
    union.uniq! if union.respond_to?(:uniq!)

    intersection.length.to_f / union.length.to_f
  end

  # Calculates the inverse of the Jaccard coefficient.
  #
  # The closer to 0.0 the distance is, the more similar two items are.
  #
  # @return [Float] <code>1.0 - #coefficient(a, b)</code>
  #
  # @see Jaccard#coefficient for parameter calling convention and caveats about Array vs Set vs other object types.
  def self.distance(a, b)
    1.0 - coefficient(a, b)
  end

  # Determines which member of +others+ has the smallest distance vs +a+.
  #
  # Because of the implementation, if multiple items from +others+ have
  # the same distance, the last one will be returned. If this is undesirable,
  # reverse +others+ before calling #closest_to.
  #
  # @param [#&, #+] a A set of attributes
  # @param [#inject] others A collection of set of attributes
  #
  # @return The item from +others+ with the smallest distance to +a+.
  #
  # @example
  #
  #   a = [1, 2, 3]
  #   b = [1, 3]
  #   c = [1, 2, 3]
  #   Jaccard.closest_to(b, [a, c]) #=> [1, 2, 3]
  #   # Note that the actual instance returned will be c
  def self.closest_to(a, others)
    # 2.0 is larger than any possible distance, so the first item always wins it.
    others.inject([2.0, nil]) do |memo, other|
      dist = distance(a, other)
      next memo if memo.first < dist

      [dist, other]
    end.last
  end

  # Returns the pair of items whose distance is minimized.
  #
  # Ties are resolved as a sort of <code>[distance, pair]</code> would
  # resolve them when the pairs are comparable; when they are not (for
  # instance disjoint Sets), the pair encountered first is kept. The
  # previous implementation sorted all candidates and raised ArgumentError
  # in that situation.
  #
  # @param [#each] items A collection of attributes.
  #
  # @return [Array<a, b>, nil] A pair of set of attributes whose Jaccard distance is the minimal,
  #   given the input set; nil when +items+ holds fewer than two distinct items.
  #
  # @example
  #
  #   a = [1, 2, 3]
  #   b = [1, 2]
  #   c = [1, 3]
  #   Jaccard.best_match([a, b, c]) #=> [[1, 2, 3], [1, 2]]
  def self.best_match(items)
    seen = Set.new
    best = nil

    items.each do |row|
      items.each do |col|
        next if row == col
        next if seen.include?([row, col]) || seen.include?([col, row])
        seen << [row, col]

        candidate = [distance(row, col), [row, col]]
        if best.nil? || best.first.nan?
          best = candidate
        else
          # nil means "incomparable": keep the current best in that case.
          order = candidate <=> best
          best = candidate if order && order < 0
        end
      end
    end

    best && best.last
  end
end