crosslanguagespotter 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/Gemfile +3 -0
  4. data/Rakefile +13 -0
  5. data/crosslanguagespotter.gemspec +36 -0
  6. data/examples/ex1.rb +13 -0
  7. data/examples/services_example.rb +13 -0
  8. data/lib/crosslanguagespotter/basic.rb +157 -0
  9. data/lib/crosslanguagespotter/context.rb +139 -0
  10. data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
  11. data/lib/crosslanguagespotter/jaccard.rb +114 -0
  12. data/lib/crosslanguagespotter/methods/context.rb +127 -0
  13. data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
  14. data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
  15. data/lib/crosslanguagespotter/model_loading.rb +333 -0
  16. data/lib/crosslanguagespotter/oracle.rb +261 -0
  17. data/lib/crosslanguagespotter/report.rb +88 -0
  18. data/lib/crosslanguagespotter/version.rb +5 -0
  19. data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
  20. data/lib/crosslanguagespotter.rb +7 -0
  21. data/lib/jars/weka.jar +0 -0
  22. data/resources/css/bootstrap-theme.css +346 -0
  23. data/resources/css/bootstrap-theme.min.css +7 -0
  24. data/resources/css/bootstrap.css +5780 -0
  25. data/resources/css/bootstrap.min.css +7 -0
  26. data/resources/css/highlightstyles/arta.css +160 -0
  27. data/resources/css/highlightstyles/ascetic.css +50 -0
  28. data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
  29. data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
  30. data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
  31. data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
  32. data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
  33. data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
  34. data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
  35. data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
  36. data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
  37. data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
  38. data/resources/css/highlightstyles/brown_paper.css +105 -0
  39. data/resources/css/highlightstyles/brown_papersq.png +0 -0
  40. data/resources/css/highlightstyles/dark.css +105 -0
  41. data/resources/css/highlightstyles/default.css +153 -0
  42. data/resources/css/highlightstyles/docco.css +132 -0
  43. data/resources/css/highlightstyles/far.css +113 -0
  44. data/resources/css/highlightstyles/foundation.css +133 -0
  45. data/resources/css/highlightstyles/github.css +125 -0
  46. data/resources/css/highlightstyles/googlecode.css +147 -0
  47. data/resources/css/highlightstyles/idea.css +122 -0
  48. data/resources/css/highlightstyles/ir_black.css +105 -0
  49. data/resources/css/highlightstyles/magula.css +123 -0
  50. data/resources/css/highlightstyles/mono-blue.css +62 -0
  51. data/resources/css/highlightstyles/monokai.css +127 -0
  52. data/resources/css/highlightstyles/monokai_sublime.css +149 -0
  53. data/resources/css/highlightstyles/obsidian.css +154 -0
  54. data/resources/css/highlightstyles/paraiso.dark.css +93 -0
  55. data/resources/css/highlightstyles/paraiso.light.css +93 -0
  56. data/resources/css/highlightstyles/pojoaque.css +106 -0
  57. data/resources/css/highlightstyles/pojoaque.jpg +0 -0
  58. data/resources/css/highlightstyles/railscasts.css +182 -0
  59. data/resources/css/highlightstyles/rainbow.css +112 -0
  60. data/resources/css/highlightstyles/school_book.css +113 -0
  61. data/resources/css/highlightstyles/school_book.png +0 -0
  62. data/resources/css/highlightstyles/solarized_dark.css +107 -0
  63. data/resources/css/highlightstyles/solarized_light.css +107 -0
  64. data/resources/css/highlightstyles/sunburst.css +160 -0
  65. data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
  66. data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
  67. data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
  68. data/resources/css/highlightstyles/tomorrow-night.css +93 -0
  69. data/resources/css/highlightstyles/tomorrow.css +90 -0
  70. data/resources/css/highlightstyles/vs.css +89 -0
  71. data/resources/css/highlightstyles/xcode.css +158 -0
  72. data/resources/css/highlightstyles/zenburn.css +117 -0
  73. data/resources/example.html +1501 -0
  74. data/resources/js/bootstrap.js +1943 -0
  75. data/resources/js/bootstrap.min.js +7 -0
  76. data/resources/js/highlight.pack.js +1 -0
  77. data/resources/services_example.html +141 -0
  78. data/resources/template.html +61 -0
  79. data/test/data/angular-puzzle.GS +111 -0
  80. data/test/data/angular_puzzle/app.js +66 -0
  81. data/test/data/angular_puzzle/index.html +67 -0
  82. data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
  83. data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
  84. data/test/data/example.html +5 -0
  85. data/test/data/example.js +4 -0
  86. data/test/data/services/index.html +33 -0
  87. data/test/data/services/script.js +15 -0
  88. data/test/test_helper.rb +9 -0
  89. data/test/test_parsing.rb +23 -0
  90. data/test/test_spotter.rb +42 -0
  91. data/test/test_wekaintegration.rb +43 -0
  92. metadata +328 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 68ee2d427d4dacc3d22b88a1deababeb2a1bcafd
4
+ data.tar.gz: 65ab29b3ba450f824a4930bd0a436170efabff57
5
+ SHA512:
6
+ metadata.gz: ec2f763115ece653ad3a1379f24397bd73e18f3d4d101b222a922fead038bb4498459319595ac86444e253176b66ca9ebe93f117632293f879ee1bbf2f348cb8
7
+ data.tar.gz: d7b1571637ec59db6e9519e035c4826fecde7d121b2e8f03898317f4eaf3cc237bb9253833f4f90357fa604fb1c0b31cfb9b1b27b496218b3066aa3034d4fae8
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ Gemfile.lock
2
+ *.gem
3
+ *.rbc
4
+ .bundle
5
+ .config
6
+ coverage
7
+ InstalledFiles
8
+ lib/bundler/man
9
+ pkg
10
+ rdoc
11
+ spec/reports
12
+ test/tmp
13
+ test/version_tmp
14
+ tmp
15
+
16
+ # YARD artifacts
17
+ .yardoc
18
+ _yardoc
19
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Fetch gems over TLS; the plain-HTTP endpoint allows tampering in transit.
source "https://rubygems.org"

# Dependencies are declared in the gemspec.
gemspec
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
require 'rake/testtask'
require 'rubygems/tasks'

# Test task: adds the 'test' directory to the libs of the task.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# rubygems-tasks tasks; the console task's command is set to 'jruby'.
Gem::Tasks.new { |gem_tasks| gem_tasks.console.command = 'jruby' }

desc "Run tests"
task :default => :test
@@ -0,0 +1,36 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'crosslanguagespotter/version'

# Gem specification for crosslanguagespotter, built for the 'java' platform.
Gem::Specification.new do |s|
  s.platform    = 'java'
  s.name        = 'crosslanguagespotter'
  s.version     = CrossLanguageSpotter::VERSION
  s.summary     = "Automatic Spotter of Cross-Language references"
  s.description = "Automatic Spotter of Cross-Language references"
  s.authors     = ["Federico Tomassetti"]
  s.email       = 'f.tomassetti@gmail.com'
  s.homepage    = 'https://github.com/CrossLanguageProject/crosslanguagerelationsspotter'
  # SPDX license identifier (the free-form "Apache v2" is not one).
  s.license     = "Apache-2.0"

  # The file list comes from git, so the gem must be built from a git checkout.
  s.files         = `git ls-files`.split($/)
  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  # Runtime dependencies.
  s.add_dependency('codemodels')
  s.add_dependency('codemodels-js')
  s.add_dependency('codemodels-html')
  s.add_dependency('codemodels-java')
  s.add_dependency('codemodels-ruby')
  s.add_dependency('codemodels-xml')
  s.add_dependency('codemodels-properties')
  s.add_dependency('htmlentities')
  s.add_dependency('liquid')

  # Development-only dependencies.
  s.add_development_dependency "bundler"
  s.add_development_dependency "rake"
  s.add_development_dependency "simplecov"
  s.add_development_dependency "rubygems-tasks"
end
data/examples/ex1.rb ADDED
@@ -0,0 +1,13 @@
1
# Example: builds a Weka classifier from the angular_puzzle sample and the
# angular-puzzle.GS file, classifies the relations of that same project and
# writes the report to resources/example.html.
$: << './lib'
require 'crosslanguagespotter'
include CrossLanguageSpotter

classifier = OracleLoader.new.build_weka_classifier(
  './test/data/angular_puzzle',
  './test/data/angular-puzzle.GS'
)

spotter   = CrossLanguageSpotter::Spotter.new
project   = Project.new('./test/data/angular_puzzle')
relations = spotter.classify_relations(project, classifier)

generate_report_file(relations, 'resources/example.html')
@@ -0,0 +1,13 @@
1
# Example: builds a Weka classifier from the angular_puzzle sample and the
# angular-puzzle.GS file, then classifies the relations of the 'services'
# sample project and writes the report to resources/services_example.html.
$: << './lib'
require 'crosslanguagespotter'
include CrossLanguageSpotter

classifier = OracleLoader.new.build_weka_classifier(
  './test/data/angular_puzzle',
  './test/data/angular-puzzle.GS'
)

spotter   = CrossLanguageSpotter::Spotter.new
project   = Project.new('./test/data/services')
relations = spotter.classify_relations(project, classifier)

generate_report_file(relations, 'resources/services_example.html')
@@ -0,0 +1,157 @@
1
+ # encoding: utf-8
2
+
3
+ require "codemodels"
4
+ require "codemodels/html"
5
+ require "codemodels/js"
6
+
7
+ require 'crosslanguagespotter/figures_evaluator'
8
+ require 'crosslanguagespotter/methods/context'
9
+ require 'crosslanguagespotter/methods/tversky'
10
+ require 'crosslanguagespotter/methods/jaro'
11
+ require 'crosslanguagespotter/model_loading'
12
+ require 'csv'
13
+ require 'set'
14
+ require 'crosslanguagespotter/jaccard'
15
+
16
+ module CrossLanguageSpotter
17
+
18
# Recursively parses every file below +dir+ with CodeModels.parse_file and
# collects the resulting models.
#
# Keys are the file paths relative to the directory the traversal started
# from, each prefixed with "/" (e.g. "/index.html", "/js/app.js").
#
# Files that cannot be parsed are reported on stdout and skipped.
#
# @param dir       directory to scan
# @param base_path key prefix accumulated while descending (callers normally
#                  leave the default)
# @param models    hash the models are added to (callers normally leave the
#                  default)
# @return the +models+ hash
def self._load_models(dir,base_path='',models={})
  Dir.foreach(dir) do |f|
    next if f=='.' || f=='..'

    path = dir+'/'+f
    if File.directory?(path)
      # Extend the prefix with the sub-directory name only. Using the whole
      # +dir+ here made same-named files in sibling folders share one key
      # and overwrite each other.
      _load_models(path,base_path+'/'+f,models)
    else
      begin
        models[base_path+'/'+f] = CodeModels.parse_file(path)
      rescue SystemExit, SignalException
        # Never swallow a requested exit or an interrupt.
        raise
      rescue Exception => e
        # NOTE(review): deliberately broad, presumably so that errors raised
        # by Java-based parsers under JRuby are also treated as "no model".
        puts "No model available for #{path}: #{e}"
      end
    end
  end
  models
end
35
+
36
# Computes similarity features for pairs of nodes of a project and, given a
# classifier, keeps the pairs that are classified as relations.
class Spotter

  def initialize
    @verbose = false
  end

  # Loads the models found under +dir+ and passes them to +_calc+.
  # NOTE(review): +_calc+ is not defined in the part of the code shown here;
  # confirm it exists before relying on this method.
  def find_relations(dir)
    models = CrossLanguageSpotter._load_models(dir)
    _calc(dir,models)
  end

  # Builds a Project for +dir+ and returns its features
  # (see #features_for_project).
  def features_for_dir(dir)
    features_for_project(Project.new(dir,@verbose))
  end

  # Classifies every candidate pair of +project+.
  #
  # @param project    the project to analyse
  # @param classifier object whose +classify+ receives the Weka instances and
  #                   yields, via +each+, one entry per row readable with
  #                   +[:result]+
  # @return [Array<Hash>] the feature rows (with +:result+ set to true) of the
  #         pairs the classifier accepted, in feature order
  def classify_relations(project,classifier)
    rows = features_for_project(project).values
    # The class attribute needs a value in every row; false is a placeholder
    # that is overwritten below for the accepted rows.
    rows.each {|row| row[:result] = false}

    keys = {shared_length: :numeric,
      tfidf_shared: :numeric,itfidf_shared: :numeric,
      perc_shared_length_min: :numeric,
      perc_shared_length_max: :numeric,
      diff_min: :numeric,diff_max: :numeric,
      perc_diff_min: :numeric,perc_diff_max: :numeric,
      context: :numeric,jaccard: :numeric,jaro: :numeric,tversky: :numeric,
      result: :boolean}
    # A copy of the list is handed over so +rows+ keeps its order and size
    # whatever the conversion does with its argument.
    data_instances = hash2weka_instances("data",rows.dup,keys,:result)
    classification = classifier.classify(data_instances)

    results = []
    i = 0
    classification.each do |c|
      if c[:result]
        # just put a true in the real relations
        rows[i][:result] = true
        results.push(rows[i])
      end
      i += 1
    end
    results
  end

  # Computes one feature row for every pair of nodes yielded by
  # +project.iter_over_shared_ids_instances+.
  #
  # @return [Hash] CrossLanguageRelation => Hash with the position of both
  #         nodes and the numeric features (shared_length, tfidf_shared,
  #         itfidf_shared, perc_shared_length_min/max, diff_min/max,
  #         perc_diff_min/max, context, jaccard, jaro, tversky)
  def features_for_project(project)
    results = {}
    tversky_producer = TverskyReferencesProducer.new({:alpha => 0.5, :threshold => 0.0})
    context_producer = ContextReferencesProducer.new({:alpha => 1.0, :threshold => 0.0})
    context_points_map = context_producer.points_map(project)
    jaro_producer = JaroReferencesProducer.new({:winkleradjust=>false,:threshold=>0.0})

    project.iter_over_shared_ids_instances do |ni,nj|
      # Context values of each node, restricted to the ids shared in the project.
      context_ni = context(ni).values & project.shared_ids
      context_nj = context(nj).values & project.shared_ids
      shared = context_ni & context_nj
      shared_length = shared.count

      file_i = ni.source.artifact(:absolute).filename
      # The second file must come from nj; it used to be read from ni, which
      # made both tf-idf terms below refer to the same file.
      file_j = nj.source.artifact(:absolute).filename
      tfidf_shared = 0
      itfidf_shared = 0
      shared.each do |v|
        tfidf_shared  += project.tf_idf(file_i,v)+project.tf_idf(file_j,v)
        itfidf_shared += project.itf_idf(file_i,v)+project.itf_idf(file_j,v)
      end

      # Fractions are defined as 0.0 for an empty context to avoid 0/0.
      perc_shared_length = [context_ni,context_nj].map do |ctx|
        ctx.count==0 ? 0.0 : shared_length.to_f/ctx.count.to_f
      end
      diffs = [context_ni.count-shared_length,context_nj.count-shared_length]
      perc_diff_i = context_ni.count==0 ? 0.0 : diffs[0].to_f/context_ni.count.to_f
      perc_diff_j = context_nj.count==0 ? 0.0 : diffs[1].to_f/context_nj.count.to_f
      perc_diffs = [perc_diff_i,perc_diff_j]

      rel = CrossLanguageRelation.new([NodeId.from_node(ni),NodeId.from_node(nj)])

      jaccard = Jaccard.coefficient(context_ni,context_nj)
      jaccard = 0.0 if jaccard.nan?
      tversky = tversky_producer.tversky_coefficient(context_ni,context_nj)
      tversky = 0.0 if tversky.nan?
      # Named context_points so the local does not shadow the context helper.
      context_points = context_points_map.points(Pair.new(ni,nj))
      jaro = jaro_producer.jaro_coefficient_from_nodes(ni,nj)
      jaro = 0.0 if jaro.nan?

      pos_i = ni.source.position(:absolute)
      pos_j = nj.source.position(:absolute)

      results[rel] = {
        node_a_file: file_i,
        node_a_begin_line: pos_i.begin_line,
        node_a_end_line: pos_i.end_line,
        node_a_begin_column: pos_i.begin_column,
        node_a_end_column: pos_i.end_column,

        node_b_file: file_j,
        node_b_begin_line: pos_j.begin_line,
        node_b_end_line: pos_j.end_line,
        node_b_begin_column: pos_j.begin_column,
        node_b_end_column: pos_j.end_column,

        shared_length: shared_length,
        tfidf_shared: tfidf_shared,itfidf_shared: itfidf_shared,
        perc_shared_length_min: perc_shared_length.min,
        perc_shared_length_max: perc_shared_length.max,
        diff_min: diffs.min,diff_max: diffs.max,
        perc_diff_min: perc_diffs.min,perc_diff_max: perc_diffs.max,
        context: context_points,jaccard: jaccard,jaro: jaro,tversky: tversky}
    end
    results
  end

end
156
+
157
+ end
@@ -0,0 +1,139 @@
1
+ require 'crosslanguagespotter/model_loading'
2
+ require 'set'
3
+
4
+ module CrossLanguageSpotter
5
+
6
# Collects the attribute values of +node+, remembering +node+ as the
# declarator of each of them.
#
# The attributes are those listed by +node.class.ecore.eAllAttributes+; nil
# values are skipped and multi-valued attributes contribute one entry per
# element.
#
# NOTE(review): the previous body referenced an undefined +values+ local,
# ignored +node+ and never filled the hash it created, so it could only raise
# NameError. The behaviour below is reconstructed from the method name and the
# unused hash; confirm it matches the intent.
#
# @param node a model node
# @return [Hash] value => Array of declarators (here always +[node]+)
def collect_values_with_declarator(node)
  declarators_per_value = Hash.new {|h,k| h[k]=[]}
  node.class.ecore.eAllAttributes.each do |a|
    v = node.send(:"#{a.name}")
    next if v.nil?

    (a.many ? v : [v]).each do |value|
      declarators = declarators_per_value[value]
      declarators << node unless declarators.include?(node)
    end
  end
  declarators_per_value
end
20
+
21
# The values surrounding a node, each with the list of declarators (the
# objects it was registered with).
class Context

  # Every value passed to #register, in call order (repetitions included).
  attr_reader :sequence_of_values

  def initialize
    @map = Hash.new {|h,k| h[k]=[]}
    @sequence_of_values = []
    @register_sequence = []
  end

  # @return [Array] the values that currently have at least one declarator
  def values
    @map.keys.select {|k| @map[k].count>0}
  end

  # @return [Integer] number of values with at least one declarator
  def count
    values.count
  end

  def has_value?(v)
    values.include?(v)
  end

  # @return [Array] the declarators registered for +value+
  def declarators_per_value(value)
    @map[value]
  end

  # Records that +declarator+ declares +value+. A declarator is stored once
  # per value, while the value itself is always appended to
  # #sequence_of_values.
  def register(value,declarator)
    @sequence_of_values << value
    @map[value] << declarator unless @map[value].include?(declarator)
    # Keep the real declarator so that #clone can replay the registration
    # (the value used to be stored in its place, which made clones lose the
    # declarators).
    @register_sequence << {value:value, declarator:declarator}
  end

  # Registers in this context every (value, declarator) pair of +other+.
  def merge(other)
    other.values.each do |v|
      other.declarators_per_value(v).each do |d|
        register(v,d)
      end
    end
  end

  # Builds a new Context by replaying every registration made on this one.
  # NOTE(review): #intersection! only edits the map, not the registration
  # log, so values dropped by it come back in a clone.
  def clone
    new_instance = Context.new
    @register_sequence.each do |r|
      new_instance.register(r[:value],r[:declarator])
    end
    new_instance
  end

  # Non-destructive variant of #intersection!.
  # @return [Context] a new context
  def intersection(values)
    new_instance = self.clone
    new_instance.intersection!(values)
    new_instance
  end

  # Drops the declarators of every value not present in +values+.
  #
  # @param values [Array, Context] with a Context, the declarators it holds
  #        for the surviving values are also appended to this context
  # @raise [RuntimeError] when +values+ is neither an Array nor a Context
  # @return [Context] self
  def intersection!(values)
    @map.keys.each do |k|
      if values.is_a? Array
        @map[k] = [] unless values.include?(k)
      elsif values.is_a? Context
        if values.has_value?(k)
          @map[k].concat(values.declarators_per_value(k))
        else
          @map[k] = []
        end
      else
        raise "error"
      end
    end
    self
  end

  # @return [Array<Hash>] one +{value:, declarators:}+ entry per value,
  #         sorted by value
  def to_a
    values.sort.map {|v| {value:v,declarators:declarators_per_value(v)}}
  end

  def to_s
    to_a.to_s
  end

end
113
+
114
# Builds the Context of +node+: the context of its container, the values of
# the container's own attributes, and the values found in the sub-trees of the
# siblings held in a different containment feature. A node without container
# gets an empty Context.
#
# @param node a model node
# @return [Context]
def context(node)
  ctx = Context.new
  parent = node.container_also_foreign
  return ctx unless parent

  # Everything visible from the container is visible from the node too.
  ctx.merge(context(parent))

  # RGen attributes of the father
  parent.collect_values_with_count.keys.each {|value| ctx.register(value,parent)}

  # siblings in different containment reference
  parent.all_children_also_foreign.each do |sibling|
    different_feature = sibling.eContainingFeature!=node.eContainingFeature
    next unless different_feature || (node.eContainingFeature==nil && node!=sibling)

    sibling.traverse(:also_foreign) do |n|
      n.collect_values_with_count.keys.each {|value| ctx.register(value,n)}
    end
  end

  ctx
end
138
+
139
+ end
@@ -0,0 +1,160 @@
1
+ module CrossLanguageSpotter
2
+
3
# Shape of a references producer: it is built from a parameters object and
# asked for the set of relations of a project. Both methods are empty here.
class CrossLanguageReferencesProducer

  # @param parameters settings of the producer (unused by this class)
  def initialize(parameters); end

  # It should produce a set of CrossLanguageRelation
  # (this implementation returns nil).
  def produce_set(project); end

end
13
+
14
# It compares different methods against the same gold set; each method can be
# instantiated several times using different parameters.
class CrossLanguageReferencesProducerMethodsComparator
  # Figures obtained against the given gold set, indexed first by producer
  # class and then by the parameters it was built with.
  attr_reader :results

  def initialize(gold_set,project)
    @project = project
    @gold_set = gold_set
    @results = Hash.new {|by_class,clazz| by_class[clazz]={}}
  end

  # Instantiates +clazz+ with +parameters+, evaluates the set it produces for
  # the project against the gold set, stores the figures and returns them.
  def add(clazz,parameters)
    observed_set = clazz.new(parameters).produce_set(@project)
    figures = FiguresEvaluator.new(@gold_set,observed_set).all_figures
    @results[clazz][parameters] = figures
  end

end
36
+
37
# Identifies a node by the file it belongs to and its index in that file.
class NodeId
  attr_reader :file
  attr_reader :node_index

  # Same value as #node_index.
  def index
    @node_index
  end

  # Builds the id of +node+ from the filename of its final host artifact and
  # the result of +traverse_index+ (defined outside this class).
  def self.from_node(node)
    new(node.source.artifact.final_host.filename,traverse_index(node))
  end

  def initialize(file,node_index)
    @file = file
    @node_index = node_index
  end

  # Two ids are equal when both file and node index are.
  def eql?(other)
    return false unless other.is_a?(NodeId)
    file.eql?(other.file) && node_index.eql?(other.node_index)
  end

  def ==(other)
    eql?(other)
  end

  def hash
    @file.hash*7+@node_index.hash
  end

  # Orders by file first, then by node index.
  #
  # @return [Integer, nil] nil when +other+ is not a NodeId, following the
  #         usual <=> contract (it used to raise NoMethodError instead)
  def <=>(other)
    return nil unless other.is_a?(NodeId)

    by_file = file <=> other.file
    by_file==0 ? (node_index <=> other.node_index) : by_file
  end

  def to_s
    "#{@file}:#{@node_index}"
  end

end
81
+
82
# An unordered pair of node ids: the two ids are stored sorted, so the order
# in which they are given does not matter for equality and hashing.
class CrossLanguageRelation
  attr_reader :node_ids

  # @param node_ids a collection of exactly two ids comparable with <=>
  # @raise [RuntimeError] when +node_ids+ does not hold exactly two elements
  def initialize(node_ids)
    raise "Two elements expected, #{node_ids.count} found" unless node_ids.count==2
    first, second = node_ids[0], node_ids[1]
    @node_ids = (first<=>second)<0 ? [first,second] : [second,first]
  end

  def eql?(other)
    other.is_a?(CrossLanguageRelation) && node_ids.eql?(other.node_ids)
  end

  def ==(other)
    eql?(other)
  end

  def hash
    lower, upper = @node_ids
    lower.hash*7+upper.hash
  end

  def to_s
    "CrossLanguageRelation #{@node_ids[0]} <-> #{@node_ids[1]}"
  end

end
116
+
117
# Calculates precision, recall and F-measure of an observed set against a
# gold set.
class FiguresEvaluator

  # Gold set is the "truth", observed is calculated from
  # some method and compared with the gold set.
  #
  # @param gold_set     collection responding to +intersection+ and +count+
  #                     (e.g. a Set)
  # @param observed_set collection responding to +count+
  def initialize(gold_set,observed_set)
    @gold_set = gold_set
    @observed_set = observed_set
  end

  # @return [Float] |gold & observed| / |observed| (NaN for an empty observed
  #         set); computed once and cached
  def precision
    @precision ||= calc_precision
  end

  # @return [Float] |gold & observed| / |gold| (NaN for an empty gold set);
  #         computed once and cached
  def recall
    @recall ||= calc_recall
  end

  # F-beta score: (1 + beta^2) * P * R / (beta^2 * P + R).
  #
  # The numerator used to be 2 * beta^2 * P * R, which only matches the
  # definition for beta == 1.
  #
  # @param beta [Numeric] weight of recall relative to precision
  # @return [Float] NaN when the denominator is zero
  def f_measure(beta=1.0)
    beta_square = beta**2.0
    ((1.0+beta_square)*precision*recall)/(beta_square*precision+recall)
  end

  # @return [Hash] precision, recall, f_measure and the beta used for it
  def all_figures(beta=1.0)
    {precision:precision,recall:recall,f_measure:f_measure(beta),beta:beta}
  end

  private

  def calc_precision
    intersection_size.to_f/@observed_set.count.to_f
  end

  def calc_recall
    intersection_size.to_f/@gold_set.count.to_f
  end

  # Number of observed elements that are also in the gold set.
  def intersection_size
    @gold_set.intersection(@observed_set).count
  end

end
159
+
160
+ end
@@ -0,0 +1,114 @@
1
+ require "set"
2
+
3
+ # Helpers to calculate the Jaccard Coefficient Index and related metrics easily.
4
+ #
5
+ # (from Wikipedia): The Jaccard coefficient measures similarity between sample sets, and is defined
6
+ # as the size of the intersection divided by the size of the union of the sample sets.
7
+ #
8
+ # The closer to 1.0 this number is, the more similar two items are.
9
module Jaccard
  # Calculates the Jaccard Coefficient Index.
  #
  # +a+ must implement the set intersection and set union operators:
  # <code>#&</code> and <code>#+</code>. Array and Set both implement these
  # methods natively. The result of <code>+</code> must either be free of
  # duplicates or respond to +#uniq!+; otherwise the coefficient will be wrong.
  #
  # The individual items in +a+ and +b+ must implement a sane #eql? method.
  #
  # @param [#&, #+] a A set of items
  # @param [#&, #+] b A second set of items
  #
  # @return [Float] The Jaccard Coefficient Index between +a+ and +b+
  #   (NaN when both are empty, since the union is then empty too).
  #
  # @raise [ArgumentError] when +a+ does not implement #& or #+
  #
  # @example
  #
  #   a = [1, 2, 3, 4]
  #   b = [1, 3, 4]
  #   Jaccard.coefficient(a, b) #=> 0.75
  #
  # @see http://en.wikipedia.org/wiki/Jaccard_index Jaccard Coefficient Index on Wikipedia.
  def self.coefficient(a, b)
    raise ArgumentError, "#{a.inspect} does not implement #&" unless a.respond_to?(:&)
    raise ArgumentError, "#{a.inspect} does not implement #+" unless a.respond_to?(:+)

    intersection = a & b
    union = a + b

    # Set does not implement #uniq! since its elements are always unique;
    # Array concatenation, on the other hand, keeps the duplicates.
    union.uniq! if union.respond_to?(:uniq!)

    intersection.length.to_f / union.length.to_f
  end

  # Calculates the inverse of the Jaccard coefficient.
  #
  # The closer to 0.0 the distance is, the more similar two items are.
  #
  # @return [Float] <code>1.0 - #coefficient(a, b)</code>
  #
  # @see Jaccard#coefficient for parameter calling convention and caveats about Array vs Set vs other object types.
  def self.distance(a, b)
    1.0 - coefficient(a, b)
  end

  # Determines which member of +others+ has the smallest distance vs +a+.
  #
  # If multiple items from +others+ have the same distance, the last one is
  # returned. If this is undesirable, reverse +others+ before calling
  # #closest_to.
  #
  # @param [#&, #+] a A set of attributes
  # @param [#inject] others A collection of sets of attributes
  #
  # @return The item from +others+ closest to +a+ (nil when +others+ is empty).
  #
  # @example
  #
  #   a = [1, 2, 3]
  #   b = [1, 3]
  #   c = [1, 2, 3]
  #   Jaccard.closest_to(b, [a, c]) #=> [1, 2, 3]
  #   # Note that the actual instance returned will be c
  def self.closest_to(a, others)
    # 2.0 is larger than any possible distance, so the first item always wins
    # against the initial memo.
    others.inject([2.0, nil]) do |memo, other|
      dist = distance(a, other)
      next memo if memo.first < dist

      [dist, other]
    end.last
  end

  # Returns the pair of items whose distance is minimized.
  #
  # Items that are equal (<code>==</code>) to each other are never paired.
  # When several pairs share the minimal distance, the first one found wins.
  #
  # @param [#each] items A collection of sets of attributes.
  #
  # @return [Array<a, b>, nil] The pair with the minimal Jaccard distance, or
  #   nil when +items+ contains no pair of distinct items.
  #
  # @example
  #
  #   a = [1, 2, 3]
  #   b = [1, 2]
  #   c = [1, 3]
  #   Jaccard.best_match([a, b, c]) #=> [[1, 2, 3], [1, 2]]
  def self.best_match(items)
    seen = Set.new
    matches = []

    items.each do |row|
      items.each do |col|
        next if row == col
        next if seen.include?([row, col]) || seen.include?([col, row])
        seen << [row, col]
        matches << [distance(row, col), [row, col]]
      end
    end

    # Compare on the distance only: sorting the whole [distance, pair] entries
    # fell back to comparing the items themselves on ties, which raises for
    # items that are not mutually comparable (e.g. Sets).
    best = matches.min_by { |dist, _pair| dist }
    # No pair at all (empty or single-item input): nothing to return.
    best && best.last
  end
end