crosslanguagespotter 0.0.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/Gemfile +3 -0
- data/Rakefile +13 -0
- data/crosslanguagespotter.gemspec +36 -0
- data/examples/ex1.rb +13 -0
- data/examples/services_example.rb +13 -0
- data/lib/crosslanguagespotter/basic.rb +157 -0
- data/lib/crosslanguagespotter/context.rb +139 -0
- data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
- data/lib/crosslanguagespotter/jaccard.rb +114 -0
- data/lib/crosslanguagespotter/methods/context.rb +127 -0
- data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
- data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
- data/lib/crosslanguagespotter/model_loading.rb +333 -0
- data/lib/crosslanguagespotter/oracle.rb +261 -0
- data/lib/crosslanguagespotter/report.rb +88 -0
- data/lib/crosslanguagespotter/version.rb +5 -0
- data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
- data/lib/crosslanguagespotter.rb +7 -0
- data/lib/jars/weka.jar +0 -0
- data/resources/css/bootstrap-theme.css +346 -0
- data/resources/css/bootstrap-theme.min.css +7 -0
- data/resources/css/bootstrap.css +5780 -0
- data/resources/css/bootstrap.min.css +7 -0
- data/resources/css/highlightstyles/arta.css +160 -0
- data/resources/css/highlightstyles/ascetic.css +50 -0
- data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
- data/resources/css/highlightstyles/brown_paper.css +105 -0
- data/resources/css/highlightstyles/brown_papersq.png +0 -0
- data/resources/css/highlightstyles/dark.css +105 -0
- data/resources/css/highlightstyles/default.css +153 -0
- data/resources/css/highlightstyles/docco.css +132 -0
- data/resources/css/highlightstyles/far.css +113 -0
- data/resources/css/highlightstyles/foundation.css +133 -0
- data/resources/css/highlightstyles/github.css +125 -0
- data/resources/css/highlightstyles/googlecode.css +147 -0
- data/resources/css/highlightstyles/idea.css +122 -0
- data/resources/css/highlightstyles/ir_black.css +105 -0
- data/resources/css/highlightstyles/magula.css +123 -0
- data/resources/css/highlightstyles/mono-blue.css +62 -0
- data/resources/css/highlightstyles/monokai.css +127 -0
- data/resources/css/highlightstyles/monokai_sublime.css +149 -0
- data/resources/css/highlightstyles/obsidian.css +154 -0
- data/resources/css/highlightstyles/paraiso.dark.css +93 -0
- data/resources/css/highlightstyles/paraiso.light.css +93 -0
- data/resources/css/highlightstyles/pojoaque.css +106 -0
- data/resources/css/highlightstyles/pojoaque.jpg +0 -0
- data/resources/css/highlightstyles/railscasts.css +182 -0
- data/resources/css/highlightstyles/rainbow.css +112 -0
- data/resources/css/highlightstyles/school_book.css +113 -0
- data/resources/css/highlightstyles/school_book.png +0 -0
- data/resources/css/highlightstyles/solarized_dark.css +107 -0
- data/resources/css/highlightstyles/solarized_light.css +107 -0
- data/resources/css/highlightstyles/sunburst.css +160 -0
- data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
- data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night.css +93 -0
- data/resources/css/highlightstyles/tomorrow.css +90 -0
- data/resources/css/highlightstyles/vs.css +89 -0
- data/resources/css/highlightstyles/xcode.css +158 -0
- data/resources/css/highlightstyles/zenburn.css +117 -0
- data/resources/example.html +1501 -0
- data/resources/js/bootstrap.js +1943 -0
- data/resources/js/bootstrap.min.js +7 -0
- data/resources/js/highlight.pack.js +1 -0
- data/resources/services_example.html +141 -0
- data/resources/template.html +61 -0
- data/test/data/angular-puzzle.GS +111 -0
- data/test/data/angular_puzzle/app.js +66 -0
- data/test/data/angular_puzzle/index.html +67 -0
- data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
- data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
- data/test/data/example.html +5 -0
- data/test/data/example.js +4 -0
- data/test/data/services/index.html +33 -0
- data/test/data/services/script.js +15 -0
- data/test/test_helper.rb +9 -0
- data/test/test_parsing.rb +23 -0
- data/test/test_spotter.rb +42 -0
- data/test/test_wekaintegration.rb +43 -0
- metadata +328 -0

data/lib/crosslanguagespotter/methods/context.rb
@@ -0,0 +1,127 @@
require 'set'
require 'crosslanguagespotter/figures_evaluator'

module CrossLanguageSpotter

  class Pair

    def initialize(a,b)
      raise "error" unless a.source.position(:absolute)
      raise "error" unless b.source.position(:absolute)
      if b.source.artifact(:absolute).filename < a.source.artifact(:absolute).filename
        @nodes = [b,a]
      else
        @nodes = [a,b]
      end
    end

    def first
      nodes[0]
    end

    def second
      nodes[1]
    end

    def nodes
      @nodes
    end

    def eql?(other)
      return false unless other.is_a?(Pair)
      self.nodes[0]==other.nodes[0] && self.nodes[1]==other.nodes[1]
    end

    def ==(other)
      self.eql?(other)
    end

    def to_s
      "[#{nodes[0]} <-> #{nodes[1]}]"
    end

    def hash
      nodes[0].hash*3+nodes[1].hash
    end

  end

  class PointsMap

    def initialize(alpha)
      @alpha = alpha
      @points = Hash.new {|h,k| h[k]=0.0}
    end

    def points(pair)
      @points[pair]
    end

    def register_context_contribute(pair,value)
      @points[pair] += 0.2*@alpha*value.to_f
    end

    def register_child_contribute(pair)
      @points[pair] += 0.1
    end

    def each(threshold, &block)
      @points.select{|k,v| v>=threshold}.each(&block)
    end

  end

  class ContextReferencesProducer

    attr_accessor :verbose

    def initialize(parameters)
      @threshold = parameters[:threshold]
      @verbose = parameters[:verbose]
      @alpha = parameters[:alpha]
    end

    def points_map(project)
      # fill points map
      points_map = PointsMap.new(@alpha)
      block1 = Proc.new do |ni,nj|
        context_ni = context(ni)
        context_nj = context(nj)
        shared_ctx = context_nj.intersection(context_ni).to_a
        shared_ctx.each do |shared_ctx_entry|
          v = shared_ctx_entry[:value]
          context_ni.declarators_per_value(v).each do |di|
            context_nj.declarators_per_value(v).each do |dj|
              points_map.register_child_contribute(Pair.new(di,dj))
            end
          end
        end
        points_map.register_context_contribute(Pair.new(ni,nj),shared_ctx.count)
      end
      project.iter_over_shared_ids_instances {|ni,nj| block1.call(ni,nj) }
      points_map
    end

    # It should produce a set of node ids
    def produce_set(project)
      set = Set.new
      puts "Context method:" if @verbose

      points_map = points_map(project)

      # look into points map
      points_map.each(@threshold).each do |pair,value|
        f = pair.first
        s = pair.second
        id_i = NodeId.from_node(f)
        id_j = NodeId.from_node(s)
        set << CrossLanguageRelation.new([id_i,id_j])
      end

      puts "Context method, set produced: #{set.count} elements" if @verbose
      set
    end

  end

end
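
To make the scoring above concrete: for every pair of nodes that share identifiers, register_context_contribute adds 0.2 * alpha per shared context value and register_child_contribute adds 0.1 per matching declarator pair; only pairs whose total reaches the threshold become CrossLanguageRelation entries. Below is a minimal standalone sketch of that arithmetic; the pair label, alpha, threshold and contribution counts are invented for illustration and are not part of the package.

    alpha     = 1.0
    threshold = 0.5
    points    = Hash.new(0.0)                 # mirrors PointsMap's default score of 0.0

    pair = "index.html:42 <-> app.js:17"      # illustrative pair label
    points[pair] += 0.2 * alpha * 3           # register_context_contribute with 3 shared context values
    points[pair] += 0.1                       # register_child_contribute: one matching declarator pair
    points[pair] += 0.1                       # ...and a second one

    points.select { |_, v| v >= threshold }
          .each { |p, v| puts "#{p} scored #{v.round(2)}" }
    # prints: index.html:42 <-> app.js:17 scored 0.8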

data/lib/crosslanguagespotter/methods/jaro.rb
@@ -0,0 +1,118 @@
require 'set'
require 'crosslanguagespotter/figures_evaluator'

class Array
  # select array items with index:
  # yields both the item and its index to the block,
  # filtered by a select statement
  def select_with_index
    index = -1
    select { |x| index += 1; yield(x, index) }
  end

  # return the array of indices of an item,
  # e.g. all indices of "a" in the string "aaabaaabba"
  def aindices(o)
    out = Array.new
    select_with_index { |x, i|
      out << i if x == o }
    out
  end
end

module CrossLanguageSpotter

  class JaroReferencesProducer

    attr_accessor :verbose

    def initialize(parameters)
      @threshold = parameters[:threshold]
      @verbose = parameters[:verbose]
      @winkleradjust = parameters[:winkleradjust]
    end

    # It should produce a set of node ids
    def produce_set(project)
      set = Set.new
      puts "Jaro method:" if @verbose

      block1 = Proc.new do |ni,nj|
        context_ni = context(ni).sequence_of_values.map{|v| v.to_s}
        context_nj = context(nj).sequence_of_values.map{|v| v.to_s}
        if jaro_coefficient(context_ni,context_nj)>@threshold
          id_i = NodeId.from_node(ni)
          id_j = NodeId.from_node(nj)
          set << CrossLanguageRelation.new([id_i,id_j])
        end
      end
      project.iter_over_shared_ids_instances {|ni,nj| block1.call(ni,nj) }
      puts "Jaro method, set produced: #{set.count} elements" if @verbose
      set
    end

    def jaro_coefficient_from_nodes(ni,nj)
      jaro_coefficient_from_context(context(ni),context(nj))
    end

    def jaro_coefficient_from_context(context_ni,context_nj)
      s1 = context_ni.sequence_of_values.map{|v| v.to_s}
      s2 = context_nj.sequence_of_values.map{|v| v.to_s}
      jaro_coefficient(s1,s2)
    end

    def jaro_coefficient(s1,s2)
      # if strings (without trailing & leading spaces) are equal - return 1
      #return 1 if str1.strip==str2.strip
      # either string blank - return 0
      #return 0 if str1.size==0 or str2.size==0
      m = 0 # number of matching chars
      tr = 0 # number of transpositions

      # get character array length
      s1l = s1.length
      s2l = s2.length
      # str2 should be the longer string
      if s1l > s2l
        s1, s2 = s2, s1
      end
      # hash from all unique str2 chars + occurrences
      # example 'aba': hash={ a => 0, b => 0 } a: first occurrence, b: first occurrence
      # if the first a was visited: { a => 1, b => 0 } a: second occurrence, b: second occurrence
      found = Hash[*s2.uniq.sort.collect {|v| [v,0]}.flatten]
      # matching distance definition
      md = (([s1l,s2l].max / 2) - 1).to_i
      s1.each_with_index do |c,i|
        # find number of matching chars
        if !found[c].nil? # character exists in str2
          # calculates distance between 2 matching characters, compared with md
          if !s2.aindices(c)[found[c]].nil?
            x = (s2.aindices(c)[found[c]] - i).abs
            if x <= md
              found[c] += 1 # increase occurrence of character
              m += 1 # increase number of matching characters
              # transpositions?
              if (x != 0)
                tr += 1
              end
            end
          end
        end
      end
      tr = (tr/2).to_i
      # calc jaro-distance
      third = 1.0/3
      jd = (third * m / s1l) + (third * m / s2l) + (third * (m - tr) / m)
      out = jd
      # winkleradjust? if the first l characters are the same
      if @winkleradjust
        l = 0
        (0..s1l-1).each { |i| s1[i]==s2[i] ? l+=1 : break }
        out = jd + (l * 0.1 * (1 - jd))
      end
      out
    end

  end

end
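
jaro_coefficient above applies a Jaro similarity, with an optional Winkler common-prefix bonus, to sequences of identifier tokens rather than to characters. The following self-contained sketch reproduces the same kind of computation so it can be run without the gem; the function name and the token arrays are invented for illustration, and the window-based matching may differ from the implementation above in edge cases.

    def jaro_winkler(s1, s2)
      s1, s2 = s2, s1 if s1.length > s2.length          # s2 becomes the longer sequence
      md = [(s2.length / 2) - 1, 0].max                 # matching window, as in the code above
      m, tr = 0, 0                                      # matches and out-of-place matches
      used = Array.new(s2.length, false)
      s1.each_with_index do |tok, i|
        lo, hi = [0, i - md].max, [i + md, s2.length - 1].min
        (lo..hi).each do |j|
          next if used[j] || s2[j] != tok
          used[j] = true
          m  += 1
          tr += 1 if i != j
          break
        end
      end
      return 0.0 if m == 0
      jd = (m.to_f / s1.length + m.to_f / s2.length + (m - tr / 2.0) / m) / 3
      l = 0                                             # Winkler adjustment: common-prefix length
      l += 1 while l < [s1.length, s2.length].min && s1[l] == s2[l]
      jd + l * 0.1 * (1 - jd)
    end

    puts jaro_winkler(%w[user name save], %w[user name load])   # => ~0.82 for these illustrative tokens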

data/lib/crosslanguagespotter/methods/tversky.rb
@@ -0,0 +1,44 @@
require 'set'
require 'crosslanguagespotter/jaccard'
require 'crosslanguagespotter/figures_evaluator'

module CrossLanguageSpotter

  class TverskyReferencesProducer

    attr_accessor :verbose

    def initialize(parameters)
      @threshold = parameters[:threshold]
      @alpha = parameters[:alpha]
    end

    # It should produce a set of node ids
    def produce_set(project)
      set = Set.new
      puts "Tversky method:" if @verbose
      block = Proc.new do |ni,nj|
        context_ni = context(ni).values & project.shared_ids
        context_nj = context(nj).values & project.shared_ids
        j = tversky_coefficient(context_ni,context_nj)
        if j>=@threshold
          id_i = NodeId.from_node(ni)
          id_j = NodeId.from_node(nj)
          puts " * '#{id_i.file}':#{id_i.index} -> '#{id_j.file}':#{id_j.index}" if @verbose
          set << CrossLanguageRelation.new([id_i,id_j])
        end
      end
      project.iter_over_shared_ids_instances {|ni,nj| block.call(ni,nj) }
      puts "Tversky method, set produced: #{set.count} elements" if @verbose
      set
    end

    def tversky_coefficient(context_ni,context_nj)
      shared = context_ni & context_nj
      others = (context_ni.count-shared.count)+(context_nj.count-shared.count)
      shared.count.to_f/(shared.count.to_f+@alpha*others.to_f)
    end

  end

end
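
tversky_coefficient is a symmetrically weighted Tversky index over the identifier values two nodes have in context, restricted to the project's shared ids: shared / (shared + alpha * (only-in-ni + only-in-nj)). A hand-computed sketch, with identifier lists and alpha invented for illustration:

    alpha = 0.5
    a = %w[userName save load rank]   # ids around a node in the HTML file (illustrative)
    b = %w[userName rank score]       # ids around a node in the JS file (illustrative)

    shared = a & b                                                 # => ["userName", "rank"]
    others = (a.count - shared.count) + (b.count - shared.count)   # => 2 + 1 = 3
    puts shared.count.to_f / (shared.count + alpha * others)       # => 2 / 3.5, about 0.571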

data/lib/crosslanguagespotter/model_loading.rb
@@ -0,0 +1,333 @@
require "codemodels"
require "codemodels/html"
require "codemodels/js"
require "crosslanguagespotter/context"

module CrossLanguageSpotter

  AngularParser = CodeModels::Html::AngularJs.parser_considering_angular_embedded_code


  def traverse_index(node)
    root = node.root(:also_foreign)
    i = 1
    root.traverse(:also_foreign) do |n|
      return i if (n==node) && (n.source.position(:absolute)==node.source.position(:absolute))
      i+=1
    end
    raise "Error..."
  end

  def node_at_traverse_index(root,index)
    i = 1
    root.traverse(:also_foreign) do |n|
      return n if (i==index)
      i+=1
    end
    raise "Error... traverse_index: #{index}. Reached #{i}"
  end


  def offset_referred_to_host(node)
    base = node.eContainer ? offset_referred_to_host(node.eContainer) : 0
    if node.eContainingFeature && node.eContainingFeature==:foreign_asts
      base+node.eContainer.source.begin_pos.line-1
    else
      base
    end
  end

  def line_referred_to_host(node,line)
    offset_referred_to_host(node)+line
  end

  def host_lines(node)
    [line_referred_to_host(node,node.source.begin_line),
     line_referred_to_host(node,node.source.end_line)]
  end

  def is_in_line?(node,line)
    line>=line_referred_to_host(node,node.source.begin_pos.line) && line<=line_referred_to_host(node,node.source.end_pos.line)
  end

  class ModelLoader

    def initialize
      @models = Hash.new do |h,k|
        h[k] = load_model(k)
      end
    end

    def model(path)
      @models[path]
    end

    private

    def load_model(relpath)
      complete_path =relpath
      raise "Unexisting file #{complete_path}" unless File.exist?(complete_path)
      if relpath.end_with?'.html'
        model = AngularParser.parse_file(complete_path)
      elsif relpath.end_with?'.js'
        model = CodeModels.parse_file(complete_path)
      else
        raise "I don't know what to do with: #{complete_path}"
      end
    end

  end

  class SequentialAst

    attr_reader :node
    attr_reader :value
    attr_reader :reference_labels

    def initialize(node,value,reference_labels)
      @node = node
      @value = value
      @reference_labels = reference_labels
    end

  end

  class Project

    def initialize(src,verbose=false)
      @models = {}
      @verbose = verbose
      load_all_models(src)
      @values_map_per_file = {}
      @value_frequencies = Hash.new {|h,k| h[k]={} }
      @idf = {}
    end

    def models
      @models
    end

    def sequential_asts
      sequential_asts = []
      @nodes_per_file.each do |f,nodes|
        nodes.each do |n|
          attribute_values_of_n = n.collect_values_with_count.keys
          context = context(n) if attribute_values_of_n.count > 0
          attribute_values_of_n.each do |v|
            sequential_asts << SequentialAst.new(n,v,context.values)
          end
        end
      end
      sequential_asts
    end

    def shared_ids
      @shared_ids = calc_shared_ids unless @shared_ids
      @shared_ids
    end

    def files
      files = Set.new
      @files_per_values.values.each do |fs|
        fs.each {|f| files<<f}
      end
      files
    end

    def iter_over_shared_ids_instances(&block)
      shared_ids.each do |v|
        iter_value_for_all_extensions(v,&block)
      end
    end

    def iter_over_shared_ids_instances_case_insensitive(&block)
      shared_ids.each do |v|
        iter_value_for_all_extensions_case_insensitive(v,&block)
      end
    end

    def iter_value_for_all_extensions(v,&block)
      extensions = []
      @files_per_values[v].each do |el|
        ext = File.extname(el)
        extensions << ext unless extensions.include?(ext)
      end
      for i in 0...extensions.count
        ext_i = extensions[i]
        for j in (i+1)...extensions.count
          ext_j = extensions[j]
          iter_value_in_extensions(v,ext_i,ext_j,&block)
        end
      end
    end

    def iter_value_for_all_extensions_case_insensitive(v,&block)
      extensions = []
      @files_per_values[v].each do |el|
        ext = File.extname(el)
        extensions << ext unless extensions.include?(ext)
      end
      for i in 0...extensions.count
        ext_i = extensions[i]
        for j in (i+1)...extensions.count
          ext_j = extensions[j]
          iter_value_in_extensions_case_insensitive(v,ext_i,ext_j,&block)
        end
      end
    end

    def iter_value_in_extensions(v,ext_i,ext_j,&block)
      raise "Error" if ext_i==ext_j
      files_i = []
      files_j = []
      vs = [v]
      vs.each do |v_el|
        @files_per_values[v_el].each do |el|
          files_i << el if File.extname(el)==ext_i
          files_j << el if File.extname(el)==ext_j
        end
      end
      nodes_i = []
      nodes_j = []
      files_i.each do |f|
        vs.each do |v_el|
          @nodes_per_value_and_file_map[v_el][f].each {|n| nodes_i << n}
        end
      end
      files_j.each do |f|
        vs.each do |v_el|
          @nodes_per_value_and_file_map[v_el][f].each {|n| nodes_j << n}
        end
      end
      for ni in nodes_i
        for nj in nodes_j
          block.call(ni,nj)
        end
      end
    end

    def iter_value_in_extensions_case_insensitive(v,ext_i,ext_j,&block)
      raise "Error" if ext_i==ext_j
      files_i = []
      files_j = []
      vs = values_case_insensitve(v)
      vs.each do |v_el|
        @files_per_values[v_el].each do |el|
          files_i << el if File.extname(el)==ext_i
          files_j << el if File.extname(el)==ext_j
        end
      end
      nodes_i = []
      nodes_j = []
      files_i.each do |f|
        vs.each do |v_el|
          @nodes_per_value_and_file_map[v_el][f].each {|n| nodes_i << n}
        end
      end
      files_j.each do |f|
        vs.each do |v_el|
          @nodes_per_value_and_file_map[v_el][f].each {|n| nodes_j << n}
        end
      end
      for ni in nodes_i
        for nj in nodes_j
          block.call(ni,nj)
        end
      end
    end

    def tf_idf(file,value)
      value_frequency(file,value)*idf(value)
    end

    def itf_idf(file,value)
      itf(file,value)*idf(value)
    end

    private

    def idf(value)
      unless @idf[value]
        pos = 0
        neg = 0
        files.each do |f|
          values_per_file(f).has_key?(value) ? pos+=1 : neg+=1
        end
        @idf[value] = Math.log((pos+neg).to_f/pos.to_f)
      end
      @idf[value]
    end

    def itf(file,value)
      Math.log(1.0/value_frequency(file,value))
    end

    def value_frequency(file,value)
      unless @value_frequencies[file][value]
        values_map = values_per_file(file)
        total = values_map.values.inject(:+)
        @value_frequencies[file][value] = values_map[value].to_f/total.to_f
      end
      @value_frequencies[file][value]
    end

    def values_per_file(file)
      unless @values_map_per_file[file]
        @values_map_per_file[file] = @ml.model(file).collect_values_with_count_subtree(:also_foreign)
      end
      @values_map_per_file[file]
    end

    def values_case_insensitve(v)
      @files_per_values.keys.select {|el| el.to_s.downcase==v.to_s.downcase}
    end

    def calc_shared_ids
      shared = []
      @files_per_values.each do |v,s|
        extensions = []
        s.each do |el|
          ext = File.extname(el)
          extensions << ext unless extensions.include?(ext)
        end
        if extensions.count>1
          shared << v
        end
      end
      shared
    end

    def load_all_models(src)
      @ml = ModelLoader.new
      @nodes_per_file = Hash.new {|h,k| h[k] = []}
      @files_per_values = Hash.new {|h,k| h[k] = Set.new}

      # nodes per value, file
      @nodes_per_value_and_file_map = Hash.new {|h,k| h[k] = Hash.new {|h,k| h[k] = [] }}

      Dir["#{src}/**/*.html"].each do |f|
        puts "Loading model from #{f}" if @verbose
        load_model_from_file(f)
      end
      Dir["#{src}/**/*.js"].each do |f|
        puts "Loading model from #{f}" if @verbose
        load_model_from_file(f)
      end
    end

    def load_model_from_file(f)
      m = @ml.model(f)
      @models[f]=m
      m.traverse_also_foreign do |n|
        @nodes_per_file[f] << n
        values = n.collect_values_with_count.keys
        values.each do |v|
          @files_per_values[v] << f
          @nodes_per_value_and_file_map[v][f] << n
        end
      end
    end

  end

end
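
Project#tf_idf combines the two private helpers above: value_frequency (how often an identifier occurs among all attribute values collected from one file) multiplied by idf (the natural log of the number of project files over the number of files containing that identifier). A hand-computed sketch with invented counts:

    tf  = 3.0 / 30              # value_frequency: the id occurs 3 times among 30 values in the file
    idf = Math.log(10.0 / 2)    # idf: 10 files in the project, 2 of them contain the id
    puts((tf * idf).round(3))   # => 0.161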