browser_web_data_entity_sumarization 1.0.0beta1 → 1.0.0beta2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicates_similarity.rb +29 -10
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_statistics.rb +28 -15
- data/lib/browser_web_data_entity_sumarization/version.rb +1 -1
- data/lib/config/entity_sumarization_config.rb +2 -0
- data/lib/utils/cache_helper.rb +27 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df328fc4a3b8ad15cd85f3ef8cc226e8090c0890
|
4
|
+
data.tar.gz: c72c4d29438b7cee335d2d72475f478f0c7da0cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5824a7f65f83928f79d5099534c892b8f2728c9c24fab480572da8b124c48b53d1d17014d87b09f72e1b18f76f85ff4ddc493f1ab598e782094e18afa23bd126
|
7
|
+
data.tar.gz: cd38b64882daa6f91e3dee375ddc80709b0f69f695c4b117e225b2b1dc88a5ca2f657939a883bc40580d463fe8e1b9b190d8bc3f9f3cc07b43b9251d04496448
|
@@ -22,6 +22,7 @@ module BrowserWebData
|
|
22
22
|
|
23
23
|
load_identical_predicates
|
24
24
|
load_different_predicates
|
25
|
+
load_counts
|
25
26
|
end
|
26
27
|
|
27
28
|
###
|
@@ -32,7 +33,7 @@ module BrowserWebData
|
|
32
33
|
# @return [String] key
|
33
34
|
def self.get_key(predicates)
|
34
35
|
predicates = [predicates] unless predicates.is_a?(Array)
|
35
|
-
"<#{predicates.join('><')}>" if predicates && !predicates.empty?
|
36
|
+
"<#{predicates.sort.join('><')}>" if predicates && !predicates.empty?
|
36
37
|
end
|
37
38
|
|
38
39
|
###
|
@@ -52,9 +53,12 @@ module BrowserWebData
|
|
52
53
|
#
|
53
54
|
# @param [Array<String>] predicates
|
54
55
|
def identify_identical_predicates(predicates, identical_limit = @identical_limit)
|
55
|
-
|
56
|
+
combination = predicates.take(IMPORTANCE_TO_IDENTIFY_MAX_COUNT).map { |p| p.to_sym }.combination(2)
|
57
|
+
five_times_count = combination.size / 20
|
58
|
+
|
59
|
+
combination.each_with_index { |values, i|
|
60
|
+
|
56
61
|
|
57
|
-
predicates.combination(2).each { |values|
|
58
62
|
|
59
63
|
already_mark_same = find_identical(values)
|
60
64
|
already_mark_different = find_different(values)
|
@@ -65,16 +69,16 @@ module BrowserWebData
|
|
65
69
|
# automatically became identical
|
66
70
|
unless try_auto_identical(values)
|
67
71
|
|
68
|
-
unless @
|
69
|
-
@
|
72
|
+
unless @counts[values[0]]
|
73
|
+
@counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
|
70
74
|
end
|
71
75
|
|
72
|
-
unless @
|
73
|
-
@
|
76
|
+
unless @counts[values[1]]
|
77
|
+
@counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
|
74
78
|
end
|
75
79
|
|
76
|
-
x = @
|
77
|
-
y = @
|
80
|
+
x = @counts[values[0]]
|
81
|
+
y = @counts[values[1]]
|
78
82
|
z = @query.get_count_of_identical_predicates(values)
|
79
83
|
|
80
84
|
identical_level = z / [x, y].max
|
@@ -86,12 +90,15 @@ module BrowserWebData
|
|
86
90
|
add_different(values)
|
87
91
|
end
|
88
92
|
end
|
93
|
+
end
|
89
94
|
|
95
|
+
if @console_output && ( i == 0 || (i+1) % five_times_count == 0 )
|
96
|
+
puts "#{Time.now.localtime} | #{(((i+1)/combination.size) * 100).round(2)}% | [#{(i+1)}/#{combination.size}]"
|
90
97
|
end
|
91
98
|
|
92
|
-
true
|
93
99
|
}
|
94
100
|
|
101
|
+
store_counts
|
95
102
|
end
|
96
103
|
|
97
104
|
###
|
@@ -235,6 +242,13 @@ module BrowserWebData
|
|
235
242
|
end
|
236
243
|
end
|
237
244
|
|
245
|
+
def load_counts
|
246
|
+
unless @counts
|
247
|
+
file_path = "#{@results_dir_path}/counts.json"
|
248
|
+
@counts = ensure_load_json(file_path, {})
|
249
|
+
end
|
250
|
+
end
|
251
|
+
|
238
252
|
def store_identical_properties
|
239
253
|
File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
|
240
254
|
end
|
@@ -243,6 +257,11 @@ module BrowserWebData
|
|
243
257
|
File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
|
244
258
|
end
|
245
259
|
|
260
|
+
def store_counts
|
261
|
+
File.write("#{@results_dir_path}/counts.json", JSON.generate(@counts))
|
262
|
+
end
|
263
|
+
|
264
|
+
|
246
265
|
def ensure_load_json(file_path, def_val, json_params = {})
|
247
266
|
if File.exists?(file_path)
|
248
267
|
file_data = File.read(file_path).force_encoding('utf-8')
|
@@ -36,7 +36,7 @@ module BrowserWebData
|
|
36
36
|
@console_output = console_output
|
37
37
|
|
38
38
|
@query = SPARQLRequest.new
|
39
|
-
@predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
|
39
|
+
@predicates_similarity = PredicatesSimilarity.new(@results_dir_path, console_output)
|
40
40
|
end
|
41
41
|
|
42
42
|
###
|
@@ -251,25 +251,38 @@ module BrowserWebData
|
|
251
251
|
# to one result knowledge base file.
|
252
252
|
#
|
253
253
|
# @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
|
254
|
-
# @param [Fixnum] best_count Define max count of properties that will be assign to entity class type.
|
255
254
|
# @param [TrueClass, FalseClass] identify_identical Flag for process identify and group identical properties as one item.
|
256
|
-
def generate_knowledge_base(type,
|
255
|
+
def generate_knowledge_base(type, identify_identical = true)
|
257
256
|
puts "_____ #{type} _____" if @console_output
|
258
257
|
files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
|
259
258
|
type = type.to_s.to_sym
|
260
259
|
|
261
260
|
knowledge_data = {type => []}
|
262
261
|
|
263
|
-
|
264
|
-
|
265
|
-
|
262
|
+
global_properties = get_global_statistic_by_type(type) || {}
|
263
|
+
|
264
|
+
if identify_identical
|
265
|
+
try_this_identical = {}
|
266
266
|
|
267
|
-
|
267
|
+
files.each { |file_path|
|
268
|
+
file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
|
268
269
|
file_data[:nif_data].each { |data|
|
269
|
-
|
270
|
-
@predicates_similarity.identify_identical_predicates(all_properties)
|
270
|
+
try_this_identical.merge!(data[:properties][type]) { |_, x, y| x + y }
|
271
271
|
}
|
272
|
+
}
|
273
|
+
|
274
|
+
try_this_identical.merge!(global_properties) { |_, x, y| x + y }
|
275
|
+
|
276
|
+
if try_this_identical.size > 0
|
277
|
+
try_this_identical = Hash[try_this_identical.sort_by { |_,v|v}.reverse]
|
278
|
+
puts "- prepare to identify identical: total count #{try_this_identical.size}" if @console_output
|
279
|
+
@predicates_similarity.identify_identical_predicates(try_this_identical.keys)
|
272
280
|
end
|
281
|
+
end
|
282
|
+
|
283
|
+
puts "- calculate: files count #{files.size}" if @console_output
|
284
|
+
files.each { |file_path|
|
285
|
+
file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
|
273
286
|
|
274
287
|
file_data[:nif_data].each { |found|
|
275
288
|
|
@@ -309,10 +322,6 @@ module BrowserWebData
|
|
309
322
|
end
|
310
323
|
}
|
311
324
|
|
312
|
-
global_properties = get_global_statistic_by_type(type) || {}
|
313
|
-
if identify_identical
|
314
|
-
@predicates_similarity.identify_identical_predicates(global_properties.keys)
|
315
|
-
end
|
316
325
|
|
317
326
|
if global_properties.size > 0
|
318
327
|
max_count = global_properties.max_by { |_, count| count }[1].to_f
|
@@ -331,7 +340,11 @@ module BrowserWebData
|
|
331
340
|
hash
|
332
341
|
}
|
333
342
|
|
334
|
-
knowledge_data[type] = knowledge_data[type].
|
343
|
+
knowledge_data[type] = knowledge_data[type].keep_if { |hash|
|
344
|
+
hash[:score] > 0
|
345
|
+
}.sort_by { |hash|
|
346
|
+
hash[:score]
|
347
|
+
}.reverse
|
335
348
|
|
336
349
|
if identify_identical
|
337
350
|
@predicates_similarity.reduce_identical
|
@@ -383,7 +396,7 @@ module BrowserWebData
|
|
383
396
|
# @param [String] path
|
384
397
|
#
|
385
398
|
# @return [Hash] classes
|
386
|
-
def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
|
399
|
+
def get_all_classes(path = File.join(__dir__, '../knowledge/classes_hierarchy.json'))
|
387
400
|
data = ensure_load_json(path, {})
|
388
401
|
HashHelper.recursive_map_keys(data)
|
389
402
|
end
|
data/lib/utils/cache_helper.rb
CHANGED
@@ -55,6 +55,33 @@ module CacheHelper
|
|
55
55
|
HashHelper.recursive_symbolize_keys(hash)
|
56
56
|
end
|
57
57
|
|
58
|
+
###
|
59
|
+
# The method helps to update knowledge by key in yield block.
|
60
|
+
#
|
61
|
+
# @param [String] key Key of stored knowledge.
|
62
|
+
#
|
63
|
+
# @yield param actual_data
|
64
|
+
# @yield return new_data
|
65
|
+
def self.update_knowledge(key)
|
66
|
+
dir_path = "#{File.dirname(File.expand_path('..', __FILE__))}/knowledge"
|
67
|
+
file_path = "#{dir_path}/#{StringHelper.get_clear_file_path(key)}.json"
|
68
|
+
|
69
|
+
hash = {}
|
70
|
+
if !File.exists?(file_path)
|
71
|
+
|
72
|
+
if block_given?
|
73
|
+
hash = yield hash
|
74
|
+
File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
|
75
|
+
end
|
76
|
+
else
|
77
|
+
old_hash = JSON.parse(File.read(file_path).force_encoding('UTF-8'), symbolize_names: true)
|
78
|
+
hash = yield old_hash
|
79
|
+
File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
|
80
|
+
end
|
81
|
+
|
82
|
+
HashHelper.recursive_symbolize_keys(hash)
|
83
|
+
end
|
84
|
+
|
58
85
|
###
|
59
86
|
# The method helps to get build in knowledge by key.
|
60
87
|
#
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: browser_web_data_entity_sumarization
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.0beta2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marek Filteš
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-04-
|
11
|
+
date: 2017-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|