crosslanguagespotter 0.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/Gemfile +3 -0
- data/Rakefile +13 -0
- data/crosslanguagespotter.gemspec +36 -0
- data/examples/ex1.rb +13 -0
- data/examples/services_example.rb +13 -0
- data/lib/crosslanguagespotter/basic.rb +157 -0
- data/lib/crosslanguagespotter/context.rb +139 -0
- data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
- data/lib/crosslanguagespotter/jaccard.rb +114 -0
- data/lib/crosslanguagespotter/methods/context.rb +127 -0
- data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
- data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
- data/lib/crosslanguagespotter/model_loading.rb +333 -0
- data/lib/crosslanguagespotter/oracle.rb +261 -0
- data/lib/crosslanguagespotter/report.rb +88 -0
- data/lib/crosslanguagespotter/version.rb +5 -0
- data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
- data/lib/crosslanguagespotter.rb +7 -0
- data/lib/jars/weka.jar +0 -0
- data/resources/css/bootstrap-theme.css +346 -0
- data/resources/css/bootstrap-theme.min.css +7 -0
- data/resources/css/bootstrap.css +5780 -0
- data/resources/css/bootstrap.min.css +7 -0
- data/resources/css/highlightstyles/arta.css +160 -0
- data/resources/css/highlightstyles/ascetic.css +50 -0
- data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
- data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
- data/resources/css/highlightstyles/brown_paper.css +105 -0
- data/resources/css/highlightstyles/brown_papersq.png +0 -0
- data/resources/css/highlightstyles/dark.css +105 -0
- data/resources/css/highlightstyles/default.css +153 -0
- data/resources/css/highlightstyles/docco.css +132 -0
- data/resources/css/highlightstyles/far.css +113 -0
- data/resources/css/highlightstyles/foundation.css +133 -0
- data/resources/css/highlightstyles/github.css +125 -0
- data/resources/css/highlightstyles/googlecode.css +147 -0
- data/resources/css/highlightstyles/idea.css +122 -0
- data/resources/css/highlightstyles/ir_black.css +105 -0
- data/resources/css/highlightstyles/magula.css +123 -0
- data/resources/css/highlightstyles/mono-blue.css +62 -0
- data/resources/css/highlightstyles/monokai.css +127 -0
- data/resources/css/highlightstyles/monokai_sublime.css +149 -0
- data/resources/css/highlightstyles/obsidian.css +154 -0
- data/resources/css/highlightstyles/paraiso.dark.css +93 -0
- data/resources/css/highlightstyles/paraiso.light.css +93 -0
- data/resources/css/highlightstyles/pojoaque.css +106 -0
- data/resources/css/highlightstyles/pojoaque.jpg +0 -0
- data/resources/css/highlightstyles/railscasts.css +182 -0
- data/resources/css/highlightstyles/rainbow.css +112 -0
- data/resources/css/highlightstyles/school_book.css +113 -0
- data/resources/css/highlightstyles/school_book.png +0 -0
- data/resources/css/highlightstyles/solarized_dark.css +107 -0
- data/resources/css/highlightstyles/solarized_light.css +107 -0
- data/resources/css/highlightstyles/sunburst.css +160 -0
- data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
- data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
- data/resources/css/highlightstyles/tomorrow-night.css +93 -0
- data/resources/css/highlightstyles/tomorrow.css +90 -0
- data/resources/css/highlightstyles/vs.css +89 -0
- data/resources/css/highlightstyles/xcode.css +158 -0
- data/resources/css/highlightstyles/zenburn.css +117 -0
- data/resources/example.html +1501 -0
- data/resources/js/bootstrap.js +1943 -0
- data/resources/js/bootstrap.min.js +7 -0
- data/resources/js/highlight.pack.js +1 -0
- data/resources/services_example.html +141 -0
- data/resources/template.html +61 -0
- data/test/data/angular-puzzle.GS +111 -0
- data/test/data/angular_puzzle/app.js +66 -0
- data/test/data/angular_puzzle/index.html +67 -0
- data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
- data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
- data/test/data/example.html +5 -0
- data/test/data/example.js +4 -0
- data/test/data/services/index.html +33 -0
- data/test/data/services/script.js +15 -0
- data/test/test_helper.rb +9 -0
- data/test/test_parsing.rb +23 -0
- data/test/test_spotter.rb +42 -0
- data/test/test_wekaintegration.rb +43 -0
- metadata +328 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'crosslanguagespotter/figures_evaluator'
|
3
|
+
|
4
|
+
module CrossLanguageSpotter
|
5
|
+
|
6
|
+
# A pair of nodes kept in a canonical order: the node whose absolute artifact
# filename sorts first comes first, so that Pair.new(a, b) and Pair.new(b, a)
# compare equal (and hash equally) when the two filenames differ.
class Pair

  # The two nodes, as a two-element array in canonical order.
  attr_reader :nodes

  # Both nodes must expose an absolute source position, otherwise a
  # RuntimeError is raised.
  def initialize(a, b)
    raise "error" unless a.source.position(:absolute)
    raise "error" unless b.source.position(:absolute)
    b_goes_first = b.source.artifact(:absolute).filename < a.source.artifact(:absolute).filename
    @nodes = b_goes_first ? [b, a] : [a, b]
  end

  # Node placed first by the canonical ordering.
  def first
    nodes[0]
  end

  # Node placed second by the canonical ordering.
  def second
    nodes[1]
  end

  # Two pairs are equal when their nodes are equal position by position.
  def eql?(other)
    other.is_a?(Pair) && first == other.first && second == other.second
  end

  def ==(other)
    eql?(other)
  end

  def to_s
    "[#{first} <-> #{second}]"
  end

  # Hash consistent with eql?, so pairs can be used as Hash keys.
  def hash
    first.hash * 3 + second.hash
  end

end
|
48
|
+
|
49
|
+
# Accumulates a floating point score per pair. Unknown pairs start at 0.0
# (and are stored as soon as they are looked up).
class PointsMap

  # alpha scales the contribution registered by register_context_contribute.
  def initialize(alpha)
    @alpha = alpha
    @points = Hash.new { |scores, pair| scores[pair] = 0.0 }
  end

  # Current score of the pair.
  def points(pair)
    @points[pair]
  end

  # Adds 0.2 * alpha * value to the score of the pair; returns the new score.
  def register_context_contribute(pair, value)
    contribution = 0.2*@alpha*value.to_f
    @points[pair] = @points[pair] + contribution
  end

  # Adds a fixed 0.1 to the score of the pair; returns the new score.
  def register_child_contribute(pair)
    @points[pair] = @points[pair] + 0.1
  end

  # Iterates over the (pair, score) entries whose score reaches the threshold.
  # Without a block an enumerator over those entries is returned.
  def each(threshold, &block)
    reaching_threshold = @points.select { |_pair, score| score >= threshold }
    reaching_threshold.each(&block)
  end

end
|
73
|
+
|
74
|
+
# Produces cross-language relations by scoring pairs of nodes according to
# the context entries they share, then keeping the pairs whose score reaches
# the configured threshold.
class ContextReferencesProducer

  attr_accessor :verbose

  # parameters is indexed with :threshold, :verbose and :alpha.
  def initialize(parameters)
    @alpha     = parameters[:alpha]
    @verbose   = parameters[:verbose]
    @threshold = parameters[:threshold]
  end

  # Builds the PointsMap for the project: for every couple of nodes yielded
  # by project.iter_over_shared_ids_instances, each couple of declarators of
  # a shared context value gets a child contribution, and the couple of nodes
  # itself gets a context contribution proportional to the number of shared
  # context entries.
  def points_map(project)
    scores = PointsMap.new(@alpha)
    project.iter_over_shared_ids_instances do |ni, nj|
      ctx_i = context(ni)
      ctx_j = context(nj)
      common_entries = ctx_j.intersection(ctx_i).to_a
      common_entries.each do |entry|
        shared_value = entry[:value]
        ctx_i.declarators_per_value(shared_value).each do |decl_i|
          ctx_j.declarators_per_value(shared_value).each do |decl_j|
            scores.register_child_contribute(Pair.new(decl_i, decl_j))
          end
        end
      end
      scores.register_context_contribute(Pair.new(ni, nj), common_entries.count)
    end
    scores
  end

  # Returns a Set of CrossLanguageRelation, one for each pair whose score
  # is at least the threshold.
  def produce_set(project)
    relations = Set.new
    puts "Context method:" if @verbose

    scored_pairs = points_map(project)

    scored_pairs.each(@threshold).each do |pair, _score|
      first_id  = NodeId.from_node(pair.first)
      second_id = NodeId.from_node(pair.second)
      relations << CrossLanguageRelation.new([first_id, second_id])
    end

    puts "Context method, set produced: #{relations.count} elements" if @verbose
    relations
  end

end
|
126
|
+
|
127
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'crosslanguagespotter/figures_evaluator'
|
3
|
+
|
4
|
+
class Array
  # Selects the items for which the block returns a truthy value.
  # The block receives both the item and its index in the array.
  def select_with_index
    select.with_index { |item, index| yield(item, index) }
  end

  # Returns the indices at which an item equal (==) to o is found.
  # Example: %w[a a b a].aindices("a") => [0, 1, 3]
  def aindices(o)
    each_index.select { |index| self[index] == o }
  end
end
|
22
|
+
|
23
|
+
module CrossLanguageSpotter
|
24
|
+
|
25
|
+
# Produces cross-language relations by comparing the sequences of context
# values of two nodes with the Jaro similarity (Jaro-Winkler when the
# :winkleradjust parameter is set).
class JaroReferencesProducer

  # Longest common prefix rewarded by the Winkler adjustment. Together with
  # the 0.1 scaling factor this keeps the adjusted score within [0, 1].
  WINKLER_MAX_PREFIX = 4
  WINKLER_SCALING = 0.1

  attr_accessor :verbose

  # parameters is indexed with :threshold, :verbose and :winkleradjust.
  def initialize(parameters)
    @threshold = parameters[:threshold]
    @verbose = parameters[:verbose]
    @winkleradjust = parameters[:winkleradjust]
  end

  # Returns a Set of CrossLanguageRelation, one for each couple of nodes
  # yielded by project.iter_over_shared_ids_instances whose coefficient is
  # strictly greater than the threshold.
  def produce_set(project)
    set = Set.new
    puts "Jaro method:" if @verbose

    project.iter_over_shared_ids_instances do |ni, nj|
      next unless jaro_coefficient_from_nodes(ni, nj) > @threshold
      id_i = NodeId.from_node(ni)
      id_j = NodeId.from_node(nj)
      set << CrossLanguageRelation.new([id_i, id_j])
    end

    puts "Jaro method, set produced: #{set.count} elements" if @verbose
    set
  end

  # Coefficient between the contexts of the two nodes.
  def jaro_coefficient_from_nodes(ni, nj)
    jaro_coefficient_from_context(context(ni), context(nj))
  end

  # Coefficient between two contexts, compared through the string form of
  # their sequences of values.
  def jaro_coefficient_from_context(context_ni, context_nj)
    s1 = context_ni.sequence_of_values.map { |v| v.to_s }
    s2 = context_nj.sequence_of_values.map { |v| v.to_s }
    jaro_coefficient(s1, s2)
  end

  # Jaro (or Jaro-Winkler) coefficient between two arrays of values.
  #
  # Returns a Float between 0.0 and 1.0. When nothing matches (including the
  # case in which one of the arrays is empty) the result is 0.0 rather than
  # NaN. Elements are looked up by Hash-key equality.
  def jaro_coefficient(s1, s2)
    # s1 is the shorter sequence from here on
    s1, s2 = s2, s1 if s1.length > s2.length
    shorter_len = s1.length
    longer_len = s2.length

    # positions of every distinct element of s2, in increasing order;
    # built once instead of rescanning s2 for each element of s1
    positions = {}
    s2.each_with_index { |value, index| (positions[value] ||= []) << index }
    # how many occurrences of each element have already been matched
    consumed = Hash.new(0)

    # matching distance; never negative, so that sequences of a single
    # element can still match at the same position
    window = [(longer_len / 2) - 1, 0].max

    matches = 0   # number of matching elements
    displaced = 0 # matching elements found at a different position

    s1.each_with_index do |value, index|
      candidates = positions[value]
      next unless candidates # element does not exist in s2
      target = candidates[consumed[value]]
      next unless target # all its occurrences were already matched
      distance = (target - index).abs
      next if distance > window
      consumed[value] += 1
      matches += 1
      displaced += 1 unless distance.zero?
    end

    # nothing in common: avoid the 0/0 divisions below
    return 0.0 if matches.zero?

    transpositions = displaced / 2

    # Jaro distance
    third = 1.0/3
    jd = (third * matches / shorter_len) +
         (third * matches / longer_len) +
         (third * (matches - transpositions) / matches)
    return jd unless @winkleradjust

    # Winkler adjustment: reward a common prefix, up to WINKLER_MAX_PREFIX
    limit = [shorter_len, WINKLER_MAX_PREFIX].min
    prefix = 0
    prefix += 1 while prefix < limit && s1[prefix] == s2[prefix]
    jd + (prefix * WINKLER_SCALING * (1 - jd))
  end

end
|
117
|
+
|
118
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'crosslanguagespotter/jaccard'
|
3
|
+
require 'crosslanguagespotter/figures_evaluator'
|
4
|
+
|
5
|
+
module CrossLanguageSpotter
|
6
|
+
|
7
|
+
# Produces cross-language relations by comparing, through a Tversky-style
# index, the shared identifiers found in the contexts of two nodes.
class TverskyReferencesProducer

  attr_accessor :verbose

  # parameters is indexed with :threshold, :alpha and :verbose.
  # :verbose is read here like the other producers do; when it is absent
  # the producer stays silent.
  def initialize(parameters)
    @threshold = parameters[:threshold]
    @verbose = parameters[:verbose]
    @alpha = parameters[:alpha]
  end

  # Returns a Set of CrossLanguageRelation, one for each couple of nodes
  # yielded by project.iter_over_shared_ids_instances whose coefficient is
  # at least the threshold. Only context values that are also among
  # project.shared_ids are considered.
  def produce_set(project)
    set = Set.new
    puts "Tversky method:" if @verbose
    project.iter_over_shared_ids_instances do |ni, nj|
      context_ni = context(ni).values & project.shared_ids
      context_nj = context(nj).values & project.shared_ids
      next unless tversky_coefficient(context_ni, context_nj) >= @threshold
      id_i = NodeId.from_node(ni)
      id_j = NodeId.from_node(nj)
      puts " * '#{id_i.file}':#{id_i.index} -> '#{id_j.file}':#{id_j.index}" if @verbose
      set << CrossLanguageRelation.new([id_i, id_j])
    end
    puts "Tversky method, set produced: #{set.count} elements" if @verbose
    set
  end

  # shared / (shared + alpha * others), where shared is the number of
  # elements the two collections have in common and others the number of
  # elements belonging to only one of them.
  # Returns 0.0 (instead of NaN) when the denominator is zero, e.g. when
  # both collections are empty.
  def tversky_coefficient(context_ni, context_nj)
    shared = (context_ni & context_nj).count
    others = (context_ni.count - shared) + (context_nj.count - shared)
    denominator = shared.to_f + @alpha * others.to_f
    return 0.0 if denominator.zero?
    shared.to_f / denominator
  end

end
|
43
|
+
|
44
|
+
end
|
@@ -0,0 +1,333 @@
|
|
1
|
+
require "codemodels"
|
2
|
+
require "codemodels/html"
|
3
|
+
require "codemodels/js"
|
4
|
+
require "crosslanguagespotter/context"
|
5
|
+
|
6
|
+
module CrossLanguageSpotter
|
7
|
+
|
8
|
+
# Parser applied by ModelLoader to '.html' files; obtained from the
# codemodels HTML support through parser_considering_angular_embedded_code.
AngularParser = CodeModels::Html::AngularJs.parser_considering_angular_embedded_code
|
9
|
+
|
10
|
+
|
11
|
+
# Returns the 1-based position at which node is met while traversing its
# root (foreign ASTs included). A visited node is considered to be the
# requested one when it is equal to it and has the same absolute position.
# Raises a RuntimeError if the node is never met.
def traverse_index(node)
  visited = 0
  node.root(:also_foreign).traverse(:also_foreign) do |candidate|
    visited += 1
    next unless candidate == node
    return visited if candidate.source.position(:absolute) == node.source.position(:absolute)
  end
  raise "Error..."
end
|
20
|
+
|
21
|
+
# Returns the node met at the given 1-based position while traversing root
# (foreign ASTs included). Raises a RuntimeError when the traversal ends
# before reaching that position.
def node_at_traverse_index(root, index)
  seen = 0
  root.traverse(:also_foreign) do |candidate|
    seen += 1
    return candidate if seen == index
  end
  raise "Error... traverse_index: #{index}. Reached #{seen + 1}"
end
|
29
|
+
|
30
|
+
|
31
|
+
# Number of lines to add to a line of node in order to refer it to the
# outermost host: the offset of its container, plus (when node is contained
# through the :foreign_asts feature) the begin line of the container minus 1.
# A node without container has offset 0.
def offset_referred_to_host(node)
  container = node.eContainer
  container_offset = container ? offset_referred_to_host(container) : 0
  feature = node.eContainingFeature
  return container_offset unless feature && feature == :foreign_asts
  container_offset + node.eContainer.source.begin_pos.line - 1
end
|
39
|
+
|
40
|
+
# Translates a line of node into a line referred to the host, by adding
# the offset computed by offset_referred_to_host.
def line_referred_to_host(node, line)
  host_offset = offset_referred_to_host(node)
  host_offset + line
end
|
43
|
+
|
44
|
+
# Returns [begin_line, end_line] of node, both referred to the host.
def host_lines(node)
  %i[begin_line end_line].map do |boundary|
    line_referred_to_host(node, node.source.public_send(boundary))
  end
end
|
48
|
+
|
49
|
+
# Tells whether the given host line falls between the begin line and the
# end line of node (boundaries included), both referred to the host.
def is_in_line?(node, line)
  source = node.source
  return false unless line >= line_referred_to_host(node, source.begin_pos.line)
  line <= line_referred_to_host(node, source.end_pos.line)
end
|
52
|
+
|
53
|
+
# Loads the models of source files, parsing each path only the first time
# it is requested and answering from a cache afterwards.
class ModelLoader

  def initialize
    @models = Hash.new { |cache, path| cache[path] = load_model(path) }
  end

  # Model of the file at the given path.
  # Raises a RuntimeError if the file does not exist or if its name ends
  # neither with '.html' nor with '.js'.
  def model(path)
    @models[path]
  end

  private

  # '.html' files are parsed with AngularParser, '.js' files with CodeModels.
  def load_model(relpath)
    complete_path = relpath
    raise "Unexisting file #{complete_path}" unless File.exist?(complete_path)
    return AngularParser.parse_file(complete_path) if relpath.end_with?('.html')
    return CodeModels.parse_file(complete_path) if relpath.end_with?('.js')
    raise "I don't know what to do with: #{complete_path}"
  end

end
|
80
|
+
|
81
|
+
# Read-only record associating a node with one of its values and with a
# collection of reference labels.
class SequentialAst

  attr_reader :node, :value, :reference_labels

  def initialize(node, value, reference_labels)
    @reference_labels = reference_labels
    @value = value
    @node = node
  end

end
|
94
|
+
|
95
|
+
# The models of all the '.html' and '.js' files found under a directory,
# indexed by the values appearing in their nodes, so that the values shared
# among files with different extensions can be iterated.
class Project

  # Hash associating each loaded file to its model.
  attr_reader :models

  # Loads the models of the '.html' and '.js' files found (recursively)
  # under src. With verbose, the name of each file is printed while loading.
  def initialize(src,verbose=false)
    @models = {}
    @verbose = verbose
    load_all_models(src)
    @values_map_per_file = {}
    @value_frequencies = Hash.new {|h,k| h[k]={} }
    @idf = {}
  end

  # One SequentialAst for each value of each node of each file. The
  # reference labels are the values of the context of the node.
  def sequential_asts
    @nodes_per_file.each_value.flat_map do |nodes|
      nodes.flat_map do |n|
        attribute_values_of_n = n.collect_values_with_count.keys
        # the context is needed only for nodes having some value
        next [] if attribute_values_of_n.empty?
        labels = context(n).values
        attribute_values_of_n.map { |v| SequentialAst.new(n,v,labels) }
      end
    end
  end

  # Values appearing in files with at least two different extensions
  # (calculated once).
  def shared_ids
    @shared_ids ||= calc_shared_ids
  end

  # Set of the files in which at least one value was found.
  def files
    @files_per_values.each_value.reduce(Set.new) { |all,fs| all.merge(fs) }
  end

  # Yields the couples of nodes (ni,nj) having a shared id in common,
  # with ni and nj coming from files with different extensions.
  def iter_over_shared_ids_instances(&block)
    shared_ids.each { |v| iter_value_for_all_extensions(v,&block) }
  end

  # Like iter_over_shared_ids_instances, but the nodes are collected also
  # for the values differing from the shared id only by case.
  def iter_over_shared_ids_instances_case_insensitive(&block)
    shared_ids.each { |v| iter_value_for_all_extensions_case_insensitive(v,&block) }
  end

  # Yields the couples of nodes having value v, for each couple of
  # different extensions of the files in which v appears.
  def iter_value_for_all_extensions(v,&block)
    each_extensions_couple(v) do |ext_i,ext_j|
      iter_value_in_extensions(v,ext_i,ext_j,&block)
    end
  end

  # Case-insensitive variant of iter_value_for_all_extensions: the couples
  # of extensions are still those of the files in which exactly v appears.
  def iter_value_for_all_extensions_case_insensitive(v,&block)
    each_extensions_couple(v) do |ext_i,ext_j|
      iter_value_in_extensions_case_insensitive(v,ext_i,ext_j,&block)
    end
  end

  # Yields each couple (ni,nj) of nodes having value v, with ni from a file
  # with extension ext_i and nj from a file with extension ext_j.
  # Raises a RuntimeError if the two extensions are the same.
  def iter_value_in_extensions(v,ext_i,ext_j,&block)
    iter_values_in_extensions([v],ext_i,ext_j,&block)
  end

  # Like iter_value_in_extensions, considering all the values equal to v
  # when the case is ignored.
  def iter_value_in_extensions_case_insensitive(v,ext_i,ext_j,&block)
    iter_values_in_extensions(values_case_insensitve(v),ext_i,ext_j,&block)
  end

  # Frequency of the value in the file multiplied by the idf of the value.
  def tf_idf(file,value)
    value_frequency(file,value)*idf(value)
  end

  # Logarithm of the inverse of the frequency of the value in the file,
  # multiplied by the idf of the value.
  def itf_idf(file,value)
    itf(file,value)*idf(value)
  end

  private

  # Yields each couple of different extensions of the files containing v,
  # in the order in which the extensions are first met.
  def each_extensions_couple(v)
    # fetch: a value never seen must not be added to the index
    extensions = @files_per_values.fetch(v) { [] }.map { |el| File.extname(el) }.uniq
    extensions.combination(2) { |ext_i,ext_j| yield ext_i,ext_j }
  end

  # Common implementation of iter_value_in_extensions and of its
  # case-insensitive variant: vs are the values to be considered.
  def iter_values_in_extensions(vs,ext_i,ext_j,&block)
    raise "Error" if ext_i==ext_j
    nodes_i = nodes_with_values_in_extension(vs,ext_i)
    nodes_j = nodes_with_values_in_extension(vs,ext_j)
    nodes_i.each do |ni|
      nodes_j.each { |nj| block.call(ni,nj) }
    end
  end

  # Nodes having one of the values vs in the files with extension ext.
  # A file containing several of the values is considered once per value,
  # so its nodes are repeated in the result.
  def nodes_with_values_in_extension(vs,ext)
    files_with_ext = vs.flat_map { |v| @files_per_values.fetch(v) { [] }.to_a }
                       .select { |el| File.extname(el)==ext }
    files_with_ext.flat_map do |f|
      vs.flat_map { |v| @nodes_per_value_and_file_map.fetch(v) { {} }.fetch(f) { [] } }
    end
  end

  # Logarithm of the number of files divided by the number of files
  # containing the value (calculated once per value).
  def idf(value)
    @idf[value] ||= begin
      all_files = files
      containing = all_files.count { |f| values_per_file(f).has_key?(value) }
      Math.log(all_files.count.to_f/containing.to_f)
    end
  end

  def itf(file,value)
    Math.log(1.0/value_frequency(file,value))
  end

  # Occurrences of the value in the file divided by the occurrences of all
  # the values in the file (calculated once per file and value).
  def value_frequency(file,value)
    @value_frequencies[file][value] ||= begin
      values_map = values_per_file(file)
      total = values_map.values.inject(:+)
      values_map[value].to_f/total.to_f
    end
  end

  # Values found in the model of the file (foreign ASTs included), as
  # returned by collect_values_with_count_subtree; calculated once per file.
  def values_per_file(file)
    @values_map_per_file[file] ||= @ml.model(file).collect_values_with_count_subtree(:also_foreign)
  end

  # Known values whose string form is equal to the one of v, ignoring case.
  def values_case_insensitve(v)
    wanted = v.to_s.downcase
    @files_per_values.keys.select { |el| el.to_s.downcase==wanted }
  end

  # Values found in files with at least two different extensions.
  def calc_shared_ids
    shared = @files_per_values.select do |_v,fs|
      fs.map { |el| File.extname(el) }.uniq.count>1
    end
    shared.keys
  end

  # Prepares the indexes and fills them with the models of all the '.html'
  # files and then of all the '.js' files found under src.
  def load_all_models(src)
    @ml = ModelLoader.new
    @nodes_per_file = Hash.new {|h,k| h[k] = []}
    @files_per_values = Hash.new {|h,k| h[k] = Set.new}

    # nodes per value, file
    @nodes_per_value_and_file_map = Hash.new {|h,k| h[k] = Hash.new {|h2,k2| h2[k2] = [] }}

    %w[html js].each do |extension|
      Dir["#{src}/**/*.#{extension}"].each do |f|
        puts "Loading model from #{f}" if @verbose
        load_model_from_file(f)
      end
    end
  end

  # Loads the model of f and registers each of its nodes (foreign ASTs
  # included) under the file and under each value of the node.
  def load_model_from_file(f)
    m = @ml.model(f)
    @models[f]=m
    m.traverse_also_foreign do |n|
      @nodes_per_file[f] << n
      n.collect_values_with_count.keys.each do |v|
        @files_per_values[v] << f
        @nodes_per_value_and_file_map[v][f] << n
      end
    end
  end

end
|
332
|
+
|
333
|
+
end
|