crosslanguagespotter 0.0.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/Gemfile +3 -0
  4. data/Rakefile +13 -0
  5. data/crosslanguagespotter.gemspec +36 -0
  6. data/examples/ex1.rb +13 -0
  7. data/examples/services_example.rb +13 -0
  8. data/lib/crosslanguagespotter/basic.rb +157 -0
  9. data/lib/crosslanguagespotter/context.rb +139 -0
  10. data/lib/crosslanguagespotter/figures_evaluator.rb +160 -0
  11. data/lib/crosslanguagespotter/jaccard.rb +114 -0
  12. data/lib/crosslanguagespotter/methods/context.rb +127 -0
  13. data/lib/crosslanguagespotter/methods/jaro.rb +118 -0
  14. data/lib/crosslanguagespotter/methods/tversky.rb +44 -0
  15. data/lib/crosslanguagespotter/model_loading.rb +333 -0
  16. data/lib/crosslanguagespotter/oracle.rb +261 -0
  17. data/lib/crosslanguagespotter/report.rb +88 -0
  18. data/lib/crosslanguagespotter/version.rb +5 -0
  19. data/lib/crosslanguagespotter/wekaintegration.rb +83 -0
  20. data/lib/crosslanguagespotter.rb +7 -0
  21. data/lib/jars/weka.jar +0 -0
  22. data/resources/css/bootstrap-theme.css +346 -0
  23. data/resources/css/bootstrap-theme.min.css +7 -0
  24. data/resources/css/bootstrap.css +5780 -0
  25. data/resources/css/bootstrap.min.css +7 -0
  26. data/resources/css/highlightstyles/arta.css +160 -0
  27. data/resources/css/highlightstyles/ascetic.css +50 -0
  28. data/resources/css/highlightstyles/atelier-dune.dark.css +93 -0
  29. data/resources/css/highlightstyles/atelier-dune.light.css +93 -0
  30. data/resources/css/highlightstyles/atelier-forest.dark.css +93 -0
  31. data/resources/css/highlightstyles/atelier-forest.light.css +93 -0
  32. data/resources/css/highlightstyles/atelier-heath.dark.css +93 -0
  33. data/resources/css/highlightstyles/atelier-heath.light.css +93 -0
  34. data/resources/css/highlightstyles/atelier-lakeside.dark.css +93 -0
  35. data/resources/css/highlightstyles/atelier-lakeside.light.css +93 -0
  36. data/resources/css/highlightstyles/atelier-seaside.dark.css +93 -0
  37. data/resources/css/highlightstyles/atelier-seaside.light.css +93 -0
  38. data/resources/css/highlightstyles/brown_paper.css +105 -0
  39. data/resources/css/highlightstyles/brown_papersq.png +0 -0
  40. data/resources/css/highlightstyles/dark.css +105 -0
  41. data/resources/css/highlightstyles/default.css +153 -0
  42. data/resources/css/highlightstyles/docco.css +132 -0
  43. data/resources/css/highlightstyles/far.css +113 -0
  44. data/resources/css/highlightstyles/foundation.css +133 -0
  45. data/resources/css/highlightstyles/github.css +125 -0
  46. data/resources/css/highlightstyles/googlecode.css +147 -0
  47. data/resources/css/highlightstyles/idea.css +122 -0
  48. data/resources/css/highlightstyles/ir_black.css +105 -0
  49. data/resources/css/highlightstyles/magula.css +123 -0
  50. data/resources/css/highlightstyles/mono-blue.css +62 -0
  51. data/resources/css/highlightstyles/monokai.css +127 -0
  52. data/resources/css/highlightstyles/monokai_sublime.css +149 -0
  53. data/resources/css/highlightstyles/obsidian.css +154 -0
  54. data/resources/css/highlightstyles/paraiso.dark.css +93 -0
  55. data/resources/css/highlightstyles/paraiso.light.css +93 -0
  56. data/resources/css/highlightstyles/pojoaque.css +106 -0
  57. data/resources/css/highlightstyles/pojoaque.jpg +0 -0
  58. data/resources/css/highlightstyles/railscasts.css +182 -0
  59. data/resources/css/highlightstyles/rainbow.css +112 -0
  60. data/resources/css/highlightstyles/school_book.css +113 -0
  61. data/resources/css/highlightstyles/school_book.png +0 -0
  62. data/resources/css/highlightstyles/solarized_dark.css +107 -0
  63. data/resources/css/highlightstyles/solarized_light.css +107 -0
  64. data/resources/css/highlightstyles/sunburst.css +160 -0
  65. data/resources/css/highlightstyles/tomorrow-night-blue.css +93 -0
  66. data/resources/css/highlightstyles/tomorrow-night-bright.css +92 -0
  67. data/resources/css/highlightstyles/tomorrow-night-eighties.css +92 -0
  68. data/resources/css/highlightstyles/tomorrow-night.css +93 -0
  69. data/resources/css/highlightstyles/tomorrow.css +90 -0
  70. data/resources/css/highlightstyles/vs.css +89 -0
  71. data/resources/css/highlightstyles/xcode.css +158 -0
  72. data/resources/css/highlightstyles/zenburn.css +117 -0
  73. data/resources/example.html +1501 -0
  74. data/resources/js/bootstrap.js +1943 -0
  75. data/resources/js/bootstrap.min.js +7 -0
  76. data/resources/js/highlight.pack.js +1 -0
  77. data/resources/services_example.html +141 -0
  78. data/resources/template.html +61 -0
  79. data/test/data/angular-puzzle.GS +111 -0
  80. data/test/data/angular_puzzle/app.js +66 -0
  81. data/test/data/angular_puzzle/index.html +67 -0
  82. data/test/data/angular_puzzle/slidingPuzzle.js +203 -0
  83. data/test/data/angular_puzzle/wordSearchPuzzle.js +270 -0
  84. data/test/data/example.html +5 -0
  85. data/test/data/example.js +4 -0
  86. data/test/data/services/index.html +33 -0
  87. data/test/data/services/script.js +15 -0
  88. data/test/test_helper.rb +9 -0
  89. data/test/test_parsing.rb +23 -0
  90. data/test/test_spotter.rb +42 -0
  91. data/test/test_wekaintegration.rb +43 -0
  92. metadata +328 -0
@@ -0,0 +1,127 @@
1
+ require 'set'
2
+ require 'crosslanguagespotter/figures_evaluator'
3
+
4
+ module CrossLanguageSpotter
5
+
6
# A pair of nodes kept in a canonical order: the node whose artifact
# filename sorts first is stored first, so Pair.new(a, b) and
# Pair.new(b, a) compare equal and hash identically when the two
# filenames differ.
class Pair

  # The two nodes, already in canonical order.
  attr_reader :nodes

  # Raises a RuntimeError when either node has no absolute position.
  def initialize(a, b)
    [a, b].each do |node|
      raise "error" unless node.source.position(:absolute)
    end
    swap = b.source.artifact(:absolute).filename < a.source.artifact(:absolute).filename
    @nodes = swap ? [b, a] : [a, b]
  end

  def first
    nodes[0]
  end

  def second
    nodes[1]
  end

  # Two pairs are equal when both of their slots hold equal nodes.
  def eql?(other)
    other.is_a?(Pair) && first == other.first && second == other.second
  end

  def ==(other)
    eql?(other)
  end

  def to_s
    "[#{first} <-> #{second}]"
  end

  # Kept consistent with #eql? so that pairs can be used as Hash keys.
  def hash
    first.hash * 3 + second.hash
  end

end
48
+
49
# Accumulates a score for each pair of nodes.
class PointsMap

  # Weight applied (together with alpha) to every context contribution.
  CONTEXT_WEIGHT = 0.2
  # Fixed amount added by every child contribution.
  CHILD_CONTRIBUTE = 0.1

  # alpha:: multiplier applied to context contributions.
  def initialize(alpha)
    @alpha = alpha
    # A plain default value (instead of a default block that assigns) keeps
    # read-only lookups from inserting entries, so pairs that were merely
    # queried never show up in #each.
    @points = Hash.new(0.0)
  end

  # Score accumulated so far for +pair+ (0.0 when nothing was registered).
  def points(pair)
    @points[pair]
  end

  # Adds CONTEXT_WEIGHT * alpha * value to the score of +pair+ and returns
  # the new score.
  def register_context_contribute(pair, value)
    @points[pair] += CONTEXT_WEIGHT * @alpha * value.to_f
  end

  # Adds CHILD_CONTRIBUTE to the score of +pair+ and returns the new score.
  def register_child_contribute(pair)
    @points[pair] += CHILD_CONTRIBUTE
  end

  # Yields (pair, score) for every registered pair whose score is at least
  # +threshold+. Without a block an Enumerator is returned.
  def each(threshold, &block)
    @points.select { |_pair, score| score >= threshold }.each(&block)
  end

end
73
+
74
# Produces cross-language relations by scoring pairs of nodes according to
# how much of their contexts they have in common.
#
# NOTE(review): relies on +context+, +NodeId+ and +CrossLanguageRelation+,
# which are defined outside this file section.
class ContextReferencesProducer

  attr_accessor :verbose

  # parameters:: Hash read for the keys :threshold, :verbose and :alpha.
  def initialize(parameters)
    @threshold = parameters[:threshold]
    @verbose   = parameters[:verbose]
    @alpha     = parameters[:alpha]
  end

  # Builds the PointsMap for +project+ by visiting every pair of nodes
  # yielded by project.iter_over_shared_ids_instances.
  def points_map(project)
    scores = PointsMap.new(@alpha)
    project.iter_over_shared_ids_instances do |ni, nj|
      ctx_i = context(ni)
      ctx_j = context(nj)
      common = ctx_j.intersection(ctx_i).to_a
      # Every couple of declarators of a shared value earns a child contribute.
      common.each do |entry|
        shared_value = entry[:value]
        ctx_i.declarators_per_value(shared_value).each do |decl_i|
          ctx_j.declarators_per_value(shared_value).each do |decl_j|
            scores.register_child_contribute(Pair.new(decl_i, decl_j))
          end
        end
      end
      # The visited pair itself is rewarded by the number of shared entries.
      scores.register_context_contribute(Pair.new(ni, nj), common.count)
    end
    scores
  end

  # It should produce a set of node ids: one CrossLanguageRelation for every
  # pair whose score reaches the configured threshold.
  def produce_set(project)
    relations = Set.new
    puts "Context method:" if @verbose

    points_map(project).each(@threshold) do |pair, _score|
      endpoints = [pair.first, pair.second].map { |node| NodeId.from_node(node) }
      relations << CrossLanguageRelation.new(endpoints)
    end

    puts "Context method, set produced: #{relations.count} elements" if @verbose
    relations
  end

end
126
+
127
+ end
@@ -0,0 +1,118 @@
1
+ require 'set'
2
+ require 'crosslanguagespotter/figures_evaluator'
3
+
4
class Array
  # Works like #select, but the block receives both the element and its
  # index; the elements for which the block is truthy are returned.
  def select_with_index
    select.with_index { |item, position| yield(item, position) }
  end

  # Returns the indices of all the elements equal to +o+.
  # Example: the indices of "a" among the characters of "aaabaaabba"
  # are [0, 1, 2, 4, 5, 6, 9].
  def aindices(o)
    each_index.select { |position| self[position] == o }
  end
end
22
+
23
+ module CrossLanguageSpotter
24
+
25
# Produces cross-language relations by comparing the sequences of context
# values of two nodes with a Jaro (optionally Jaro-Winkler) style
# coefficient.
#
# NOTE(review): relies on +context+, +NodeId+ and +CrossLanguageRelation+,
# which are defined outside this file section.
class JaroReferencesProducer

  # Winkler's prefix bonus considers at most this many leading elements;
  # together with the 0.1 scaling this keeps the result within 0..1.
  WINKLER_MAX_PREFIX = 4
  WINKLER_SCALING = 0.1

  attr_accessor :verbose

  # parameters:: Hash read for the keys :threshold, :verbose and
  #              :winkleradjust.
  def initialize(parameters)
    @threshold = parameters[:threshold]
    @verbose = parameters[:verbose]
    @winkleradjust = parameters[:winkleradjust]
  end

  # It should produce a set of node ids: one CrossLanguageRelation for every
  # pair of nodes whose coefficient is strictly greater than the threshold.
  def produce_set(project)
    set = Set.new
    puts "Jaro method:" if @verbose

    project.iter_over_shared_ids_instances do |ni, nj|
      next unless jaro_coefficient_from_nodes(ni, nj) > @threshold
      set << CrossLanguageRelation.new([NodeId.from_node(ni), NodeId.from_node(nj)])
    end

    puts "Jaro method, set produced: #{set.count} elements" if @verbose
    set
  end

  def jaro_coefficient_from_nodes(ni, nj)
    jaro_coefficient_from_context(context(ni), context(nj))
  end

  # Compares the stringified value sequences of the two contexts.
  def jaro_coefficient_from_context(context_ni, context_nj)
    s1 = context_ni.sequence_of_values.map { |v| v.to_s }
    s2 = context_nj.sequence_of_values.map { |v| v.to_s }
    jaro_coefficient(s1, s2)
  end

  # Computes the coefficient between two sequences (arrays of elements).
  #
  # Returns a Float between 0.0 and 1.0. It is 0.0 when no element matches,
  # which includes the case of one or both sequences being empty (the
  # formula would otherwise divide by zero and produce NaN).
  #
  # As in the previous implementation, the "transpositions" term is half
  # the number of matched elements found at a different index.
  def jaro_coefficient(s1, s2)
    shorter, longer = s1.length > s2.length ? [s2, s1] : [s1, s2]
    return 0.0 if shorter.empty?

    # Indices at which each distinct element occurs in the longer sequence,
    # computed once instead of rescanning the sequence for every element.
    positions = {}
    longer.each_with_index { |value, index| (positions[value] ||= []) << index }

    # How many occurrences of each element have already been matched.
    consumed = Hash.new(0)
    # Matching distance; never negative, so that sequences of length one
    # can still match at distance zero.
    window = [(longer.length / 2) - 1, 0].max

    matches = 0
    displaced = 0
    shorter.each_with_index do |value, index|
      candidates = positions[value]
      next unless candidates                  # element absent from the longer sequence
      target = candidates[consumed[value]]
      next unless target                      # all of its occurrences are already matched
      distance = (target - index).abs
      next if distance > window
      consumed[value] += 1
      matches += 1
      displaced += 1 unless distance.zero?
    end
    return 0.0 if matches.zero?

    transpositions = displaced / 2
    third = 1.0 / 3
    jd = (third * matches / shorter.length) +
         (third * matches / longer.length) +
         (third * (matches - transpositions) / matches)
    return jd unless @winkleradjust

    # Winkler adjustment: reward a common prefix, capped at
    # WINKLER_MAX_PREFIX elements.
    prefix = 0
    while prefix < WINKLER_MAX_PREFIX && prefix < shorter.length && shorter[prefix] == longer[prefix]
      prefix += 1
    end
    jd + (prefix * WINKLER_SCALING * (1 - jd))
  end

end
117
+
118
+ end
@@ -0,0 +1,44 @@
1
+ require 'set'
2
+ require 'crosslanguagespotter/jaccard'
3
+ require 'crosslanguagespotter/figures_evaluator'
4
+
5
+ module CrossLanguageSpotter
6
+
7
# Produces cross-language relations by comparing, for every pair of nodes,
# the shared identifiers found in their contexts with a Tversky-style
# coefficient.
#
# NOTE(review): relies on +context+, +NodeId+ and +CrossLanguageRelation+,
# which are defined outside this file section.
class TverskyReferencesProducer

  attr_accessor :verbose

  # parameters:: Hash read for the keys :threshold, :alpha and :verbose.
  #              :verbose is honoured here as in the other producers;
  #              it stays nil (silent) when absent.
  def initialize(parameters)
    @threshold = parameters[:threshold]
    @alpha = parameters[:alpha]
    @verbose = parameters[:verbose]
  end

  # It should produce a set of node ids: one CrossLanguageRelation for every
  # pair of nodes whose coefficient is at least the threshold.
  def produce_set(project)
    set = Set.new
    puts "Tversky method:" if @verbose
    shared_ids = project.shared_ids
    project.iter_over_shared_ids_instances do |ni, nj|
      # Only the identifiers shared across languages are compared.
      context_ni = context(ni).values & shared_ids
      context_nj = context(nj).values & shared_ids
      next unless tversky_coefficient(context_ni, context_nj) >= @threshold
      id_i = NodeId.from_node(ni)
      id_j = NodeId.from_node(nj)
      puts " * '#{id_i.file}':#{id_i.index} -> '#{id_j.file}':#{id_j.index}" if @verbose
      set << CrossLanguageRelation.new([id_i, id_j])
    end
    puts "Tversky method, set produced: #{set.count} elements" if @verbose
    set
  end

  # shared / (shared + alpha * others), where +shared+ is the number of
  # elements common to both collections and +others+ the number of elements
  # that belong to only one of them.
  #
  # Returns 0.0 when nothing is shared; this also covers two empty
  # collections, for which the division would yield NaN.
  def tversky_coefficient(context_ni, context_nj)
    shared = (context_ni & context_nj).count
    return 0.0 if shared.zero?
    others = (context_ni.count - shared) + (context_nj.count - shared)
    shared.to_f / (shared.to_f + @alpha * others.to_f)
  end

end
43
+
44
+ end
@@ -0,0 +1,333 @@
1
+ require "codemodels"
2
+ require "codemodels/html"
3
+ require "codemodels/js"
4
+ require "crosslanguagespotter/context"
5
+
6
+ module CrossLanguageSpotter
7
+
8
+ AngularParser = CodeModels::Html::AngularJs.parser_considering_angular_embedded_code
9
+
10
+
11
# Returns the 1-based position at which +node+ is met while traversing the
# tree from its root (foreign ASTs included). A visited node is a hit when
# it is == to +node+ and has the same absolute position.
# Raises a RuntimeError when the traversal never meets the node.
def traverse_index(node)
  visited = 0
  node.root(:also_foreign).traverse(:also_foreign) do |candidate|
    visited += 1
    next unless candidate == node
    return visited if candidate.source.position(:absolute) == node.source.position(:absolute)
  end
  raise "Error..."
end
20
+
21
# Returns the node met at the 1-based position +index+ while traversing the
# tree rooted at +root+ (foreign ASTs included).
# Raises a RuntimeError when the traversal ends before reaching +index+.
def node_at_traverse_index(root, index)
  seen = 0
  root.traverse(:also_foreign) do |candidate|
    seen += 1
    return candidate if seen == index
  end
  raise "Error... traverse_index: #{index}. Reached #{seen + 1}"
end
29
+
30
+
31
# Number of lines to add to a line of +node+ to express it relative to the
# outermost (host) document. Containers are walked recursively: every
# ancestor-or-self contained through the :foreign_asts feature adds the
# begin line of its container, minus one.
def offset_referred_to_host(node)
  container = node.eContainer
  inherited = container ? offset_referred_to_host(container) : 0
  return inherited unless node.eContainingFeature == :foreign_asts
  inherited + container.source.begin_pos.line - 1
end
39
+
40
# Translates +line+, expressed relative to +node+'s own document, into a
# line of the host document by adding the node's host offset.
def line_referred_to_host(node, line)
  host_offset = offset_referred_to_host(node)
  host_offset + line
end
43
+
44
# Returns [begin_line, end_line] of +node+, both translated to lines of the
# host document.
def host_lines(node)
  span = node.source
  [span.begin_line, span.end_line].map { |line| line_referred_to_host(node, line) }
end
48
+
49
# True when +line+ (a line of the host document) falls between the first
# and the last host line covered by +node+, boundaries included.
def is_in_line?(node, line)
  first_host_line = line_referred_to_host(node, node.source.begin_pos.line)
  return false unless line >= first_host_line
  line <= line_referred_to_host(node, node.source.end_pos.line)
end
52
+
53
# Loads the model of a source file on first request and caches it by path.
class ModelLoader

  def initialize
    # The default block parses a path the first time it is asked for and
    # stores the result, so every file is parsed at most once.
    @models = Hash.new { |cache, path| cache[path] = load_model(path) }
  end

  # Returns the (cached) model of the file at +path+.
  def model(path)
    @models[path]
  end

  private

  # Parses the file with the parser selected by its suffix: AngularParser
  # for '.html', CodeModels for '.js'.
  # Raises a RuntimeError for missing files and for any other suffix.
  def load_model(relpath)
    complete_path = relpath
    raise "Unexisting file #{complete_path}" unless File.exist?(complete_path)
    return AngularParser.parse_file(complete_path) if relpath.end_with?('.html')
    return CodeModels.parse_file(complete_path) if relpath.end_with?('.js')
    raise "I don't know what to do with: #{complete_path}"
  end

end
80
+
81
# Read-only record tying a node to one of its values and to the reference
# labels it was created with.
class SequentialAst

  attr_reader :node, :value, :reference_labels

  def initialize(node, value, reference_labels)
    @node, @value, @reference_labels = node, value, reference_labels
  end

end
94
+
95
# Index of all the .html and .js models found under a source directory:
# which values appear in which files and which nodes carry them.
#
# NOTE(review): relies on +ModelLoader+, +SequentialAst+, +context+ and on
# +Set+ being loaded; none of them is required inside this class.
class Project

  # src::     directory searched recursively for *.html and *.js files.
  # verbose:: when truthy, the name of every loaded file is printed.
  def initialize(src, verbose = false)
    @models = {}
    @verbose = verbose
    load_all_models(src)
    @values_map_per_file = {}
    @value_frequencies = Hash.new { |h, k| h[k] = {} }
    @idf = {}
  end

  # Hash file path => model.
  def models
    @models
  end

  # One SequentialAst per (node, value) couple, for every loaded node that
  # has at least one value. The reference labels are the values of the
  # node's context.
  def sequential_asts
    result = []
    @nodes_per_file.each_value do |nodes|
      nodes.each do |n|
        values = n.collect_values_with_count.keys
        next if values.empty?
        node_context = context(n)
        values.each { |v| result << SequentialAst.new(n, v, node_context.values) }
      end
    end
    result
  end

  # Values appearing in files of at least two different extensions
  # (computed once, then cached).
  def shared_ids
    @shared_ids ||= calc_shared_ids
  end

  # Set of all the files containing at least one value.
  def files
    @files_per_values.values.inject(Set.new) { |all, fs| all.merge(fs) }
  end

  # Calls the block with (ni, nj) for every couple of nodes that carry the
  # same shared value in files with different extensions.
  def iter_over_shared_ids_instances(&block)
    shared_ids.each { |v| iter_value_for_all_extensions(v, &block) }
  end

  # Like #iter_over_shared_ids_instances, but nodes carrying a value that
  # differs from the shared one only by case take part as well.
  def iter_over_shared_ids_instances_case_insensitive(&block)
    shared_ids.each { |v| iter_value_for_all_extensions_case_insensitive(v, &block) }
  end

  # Visits every couple of distinct extensions among the files containing +v+.
  def iter_value_for_all_extensions(v, &block)
    extension_pairs(v).each do |ext_i, ext_j|
      iter_value_in_extensions(v, ext_i, ext_j, &block)
    end
  end

  # The extensions considered are those of the files containing +v+ with
  # this exact case.
  def iter_value_for_all_extensions_case_insensitive(v, &block)
    extension_pairs(v).each do |ext_i, ext_j|
      iter_value_in_extensions_case_insensitive(v, ext_i, ext_j, &block)
    end
  end

  # Calls the block with (ni, nj) for every node ni carrying +v+ in a file
  # with extension +ext_i+ and every node nj carrying it in a file with
  # extension +ext_j+. Raises a RuntimeError when the extensions coincide.
  def iter_value_in_extensions(v, ext_i, ext_j, &block)
    iter_values_in_extensions([v], ext_i, ext_j, &block)
  end

  # Same as #iter_value_in_extensions, considering every known value equal
  # to +v+ ignoring case. A file containing several case variants is
  # visited once, so its nodes are not reported multiple times.
  def iter_value_in_extensions_case_insensitive(v, ext_i, ext_j, &block)
    iter_values_in_extensions(values_case_insensitve(v), ext_i, ext_j, &block)
  end

  def tf_idf(file, value)
    value_frequency(file, value) * idf(value)
  end

  def itf_idf(file, value)
    itf(file, value) * idf(value)
  end

  private

  # Couples [ext_i, ext_j] of distinct extensions of the files containing
  # +value+, in order of first appearance.
  def extension_pairs(value)
    extensions = @files_per_values.fetch(value, []).map { |f| File.extname(f) }.uniq
    extensions.combination(2).to_a
  end

  # Shared implementation of the iter_value_in_extensions* methods.
  def iter_values_in_extensions(values, ext_i, ext_j, &block)
    raise "Error" if ext_i == ext_j
    nodes_i = nodes_with_extension(values, ext_i)
    nodes_j = nodes_with_extension(values, ext_j)
    nodes_i.each do |ni|
      nodes_j.each { |nj| block.call(ni, nj) }
    end
  end

  # Nodes carrying any of +values+ in the files with extension +ext+.
  # fetch is used so that lookups never add empty entries to the indexes.
  def nodes_with_extension(values, ext)
    candidates = values.flat_map { |v| @files_per_values.fetch(v, []).to_a }
    matching = candidates.select { |f| File.extname(f) == ext }.uniq
    matching.flat_map do |f|
      values.flat_map { |v| @nodes_per_value_and_file_map.fetch(v, {}).fetch(f, []) }
    end
  end

  # log(number of files / number of files containing the value), cached.
  def idf(value)
    unless @idf[value]
      pos = 0
      neg = 0
      files.each do |f|
        values_per_file(f).has_key?(value) ? pos += 1 : neg += 1
      end
      @idf[value] = Math.log((pos + neg).to_f / pos.to_f)
    end
    @idf[value]
  end

  def itf(file, value)
    Math.log(1.0 / value_frequency(file, value))
  end

  # Occurrences of +value+ in +file+ divided by the occurrences of all the
  # values in that file, cached.
  def value_frequency(file, value)
    unless @value_frequencies[file][value]
      values_map = values_per_file(file)
      total = values_map.values.inject(:+)
      @value_frequencies[file][value] = values_map[value].to_f / total.to_f
    end
    @value_frequencies[file][value]
  end

  # Map value => count for the whole model of +file+, cached.
  def values_per_file(file)
    @values_map_per_file[file] ||= @ml.model(file).collect_values_with_count_subtree(:also_foreign)
  end

  # All the known values equal to +v+ when compared as downcased strings.
  def values_case_insensitve(v)
    wanted = v.to_s.downcase
    @files_per_values.keys.select { |candidate| candidate.to_s.downcase == wanted }
  end

  def calc_shared_ids
    @files_per_values.keys.select do |v|
      @files_per_values[v].map { |f| File.extname(f) }.uniq.count > 1
    end
  end

  # Creates the indexes and fills them with every .html file, then every
  # .js file, found under +src+.
  def load_all_models(src)
    @ml = ModelLoader.new
    @nodes_per_file = Hash.new { |h, k| h[k] = [] }
    @files_per_values = Hash.new { |h, k| h[k] = Set.new }

    # nodes per value, file
    @nodes_per_value_and_file_map = Hash.new do |by_value, value|
      by_value[value] = Hash.new { |by_file, file| by_file[file] = [] }
    end

    %w[html js].each do |extension|
      Dir["#{src}/**/*.#{extension}"].each do |f|
        puts "Loading model from #{f}" if @verbose
        load_model_from_file(f)
      end
    end
  end

  # Registers the model of +f+ and indexes each of its nodes (foreign ones
  # included) under the file and under every value it carries.
  def load_model_from_file(f)
    m = @ml.model(f)
    @models[f] = m
    m.traverse_also_foreign do |n|
      @nodes_per_file[f] << n
      n.collect_values_with_count.keys.each do |v|
        @files_per_values[v] << f
        @nodes_per_value_and_file_map[v][f] << n
      end
    end
  end

end
332
+
333
+ end