browser_web_data_entity_sumarization 1.0.0beta1 → 1.0.0beta2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicates_similarity.rb +29 -10
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_statistics.rb +28 -15
- data/lib/browser_web_data_entity_sumarization/version.rb +1 -1
- data/lib/config/entity_sumarization_config.rb +2 -0
- data/lib/utils/cache_helper.rb +27 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df328fc4a3b8ad15cd85f3ef8cc226e8090c0890
|
4
|
+
data.tar.gz: c72c4d29438b7cee335d2d72475f478f0c7da0cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5824a7f65f83928f79d5099534c892b8f2728c9c24fab480572da8b124c48b53d1d17014d87b09f72e1b18f76f85ff4ddc493f1ab598e782094e18afa23bd126
|
7
|
+
data.tar.gz: cd38b64882daa6f91e3dee375ddc80709b0f69f695c4b117e225b2b1dc88a5ca2f657939a883bc40580d463fe8e1b9b190d8bc3f9f3cc07b43b9251d04496448
|
@@ -22,6 +22,7 @@ module BrowserWebData
|
|
22
22
|
|
23
23
|
load_identical_predicates
|
24
24
|
load_different_predicates
|
25
|
+
load_counts
|
25
26
|
end
|
26
27
|
|
27
28
|
###
|
@@ -32,7 +33,7 @@ module BrowserWebData
|
|
32
33
|
# @return [String] key
|
33
34
|
def self.get_key(predicates)
|
34
35
|
predicates = [predicates] unless predicates.is_a?(Array)
|
35
|
-
"<#{predicates.join('><')}>" if predicates && !predicates.empty?
|
36
|
+
"<#{predicates.sort.join('><')}>" if predicates && !predicates.empty?
|
36
37
|
end
|
37
38
|
|
38
39
|
###
|
@@ -52,9 +53,12 @@ module BrowserWebData
|
|
52
53
|
#
|
53
54
|
# @param [Array<String>] predicates
|
54
55
|
def identify_identical_predicates(predicates, identical_limit = @identical_limit)
|
55
|
-
|
56
|
+
combination = predicates.take(IMPORTANCE_TO_IDENTIFY_MAX_COUNT).map { |p| p.to_sym }.combination(2)
|
57
|
+
five_times_count = combination.size / 20
|
58
|
+
|
59
|
+
combination.each_with_index { |values, i|
|
60
|
+
|
56
61
|
|
57
|
-
predicates.combination(2).each { |values|
|
58
62
|
|
59
63
|
already_mark_same = find_identical(values)
|
60
64
|
already_mark_different = find_different(values)
|
@@ -65,16 +69,16 @@ module BrowserWebData
|
|
65
69
|
# automatically became identical
|
66
70
|
unless try_auto_identical(values)
|
67
71
|
|
68
|
-
unless @
|
69
|
-
@
|
72
|
+
unless @counts[values[0]]
|
73
|
+
@counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
|
70
74
|
end
|
71
75
|
|
72
|
-
unless @
|
73
|
-
@
|
76
|
+
unless @counts[values[1]]
|
77
|
+
@counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
|
74
78
|
end
|
75
79
|
|
76
|
-
x = @
|
77
|
-
y = @
|
80
|
+
x = @counts[values[0]]
|
81
|
+
y = @counts[values[1]]
|
78
82
|
z = @query.get_count_of_identical_predicates(values)
|
79
83
|
|
80
84
|
identical_level = z / [x, y].max
|
@@ -86,12 +90,15 @@ module BrowserWebData
|
|
86
90
|
add_different(values)
|
87
91
|
end
|
88
92
|
end
|
93
|
+
end
|
89
94
|
|
95
|
+
if @console_output && ( i == 0 || (i+1) % five_times_count == 0 )
|
96
|
+
puts "#{Time.now.localtime} | #{(((i+1)/combination.size) * 100).round(2)}% | [#{(i+1)}/#{combination.size}]"
|
90
97
|
end
|
91
98
|
|
92
|
-
true
|
93
99
|
}
|
94
100
|
|
101
|
+
store_counts
|
95
102
|
end
|
96
103
|
|
97
104
|
###
|
@@ -235,6 +242,13 @@ module BrowserWebData
|
|
235
242
|
end
|
236
243
|
end
|
237
244
|
|
245
|
+
def load_counts
|
246
|
+
unless @counts
|
247
|
+
file_path = "#{@results_dir_path}/counts.json"
|
248
|
+
@counts = ensure_load_json(file_path, {})
|
249
|
+
end
|
250
|
+
end
|
251
|
+
|
238
252
|
def store_identical_properties
|
239
253
|
File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
|
240
254
|
end
|
@@ -243,6 +257,11 @@ module BrowserWebData
|
|
243
257
|
File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
|
244
258
|
end
|
245
259
|
|
260
|
+
def store_counts
|
261
|
+
File.write("#{@results_dir_path}/counts.json", JSON.generate(@counts))
|
262
|
+
end
|
263
|
+
|
264
|
+
|
246
265
|
def ensure_load_json(file_path, def_val, json_params = {})
|
247
266
|
if File.exists?(file_path)
|
248
267
|
file_data = File.read(file_path).force_encoding('utf-8')
|
@@ -36,7 +36,7 @@ module BrowserWebData
|
|
36
36
|
@console_output = console_output
|
37
37
|
|
38
38
|
@query = SPARQLRequest.new
|
39
|
-
@predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
|
39
|
+
@predicates_similarity = PredicatesSimilarity.new(@results_dir_path, console_output)
|
40
40
|
end
|
41
41
|
|
42
42
|
###
|
@@ -251,25 +251,38 @@ module BrowserWebData
|
|
251
251
|
# to one result knowledge base file.
|
252
252
|
#
|
253
253
|
# @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
|
254
|
-
# @param [Fixnum] best_count Define max count of properties that will be assign to entity class type.
|
255
254
|
# @param [TrueClass, FalseClass] identify_identical Flag for process identify and group identical properties as one item.
|
256
|
-
def generate_knowledge_base(type,
|
255
|
+
def generate_knowledge_base(type, identify_identical = true)
|
257
256
|
puts "_____ #{type} _____" if @console_output
|
258
257
|
files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
|
259
258
|
type = type.to_s.to_sym
|
260
259
|
|
261
260
|
knowledge_data = {type => []}
|
262
261
|
|
263
|
-
|
264
|
-
|
265
|
-
|
262
|
+
global_properties = get_global_statistic_by_type(type) || {}
|
263
|
+
|
264
|
+
if identify_identical
|
265
|
+
try_this_identical = {}
|
266
266
|
|
267
|
-
|
267
|
+
files.each { |file_path|
|
268
|
+
file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
|
268
269
|
file_data[:nif_data].each { |data|
|
269
|
-
|
270
|
-
@predicates_similarity.identify_identical_predicates(all_properties)
|
270
|
+
try_this_identical.merge!(data[:properties][type]) { |_, x, y| x + y }
|
271
271
|
}
|
272
|
+
}
|
273
|
+
|
274
|
+
try_this_identical.merge!(global_properties) { |_, x, y| x + y }
|
275
|
+
|
276
|
+
if try_this_identical.size > 0
|
277
|
+
try_this_identical = Hash[try_this_identical.sort_by { |_,v|v}.reverse]
|
278
|
+
puts "- prepare to identify identical: total count #{try_this_identical.size}" if @console_output
|
279
|
+
@predicates_similarity.identify_identical_predicates(try_this_identical.keys)
|
272
280
|
end
|
281
|
+
end
|
282
|
+
|
283
|
+
puts "- calculate: files count #{files.size}" if @console_output
|
284
|
+
files.each { |file_path|
|
285
|
+
file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
|
273
286
|
|
274
287
|
file_data[:nif_data].each { |found|
|
275
288
|
|
@@ -309,10 +322,6 @@ module BrowserWebData
|
|
309
322
|
end
|
310
323
|
}
|
311
324
|
|
312
|
-
global_properties = get_global_statistic_by_type(type) || {}
|
313
|
-
if identify_identical
|
314
|
-
@predicates_similarity.identify_identical_predicates(global_properties.keys)
|
315
|
-
end
|
316
325
|
|
317
326
|
if global_properties.size > 0
|
318
327
|
max_count = global_properties.max_by { |_, count| count }[1].to_f
|
@@ -331,7 +340,11 @@ module BrowserWebData
|
|
331
340
|
hash
|
332
341
|
}
|
333
342
|
|
334
|
-
knowledge_data[type] = knowledge_data[type].
|
343
|
+
knowledge_data[type] = knowledge_data[type].keep_if { |hash|
|
344
|
+
hash[:score] > 0
|
345
|
+
}.sort_by { |hash|
|
346
|
+
hash[:score]
|
347
|
+
}.reverse
|
335
348
|
|
336
349
|
if identify_identical
|
337
350
|
@predicates_similarity.reduce_identical
|
@@ -383,7 +396,7 @@ module BrowserWebData
|
|
383
396
|
# @param [String] path
|
384
397
|
#
|
385
398
|
# @return [Hash] classes
|
386
|
-
def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
|
399
|
+
def get_all_classes(path = File.join(__dir__, '../knowledge/classes_hierarchy.json'))
|
387
400
|
data = ensure_load_json(path, {})
|
388
401
|
HashHelper.recursive_map_keys(data)
|
389
402
|
end
|
data/lib/utils/cache_helper.rb
CHANGED
@@ -55,6 +55,33 @@ module CacheHelper
|
|
55
55
|
HashHelper.recursive_symbolize_keys(hash)
|
56
56
|
end
|
57
57
|
|
58
|
+
###
|
59
|
+
# The method helps to update knowledge by key in yield block.
|
60
|
+
#
|
61
|
+
# @param [String] key Key of stored knowledge.
|
62
|
+
#
|
63
|
+
# @yield param actual_data
|
64
|
+
# @yield return new_data
|
65
|
+
def self.update_knowledge(key)
|
66
|
+
dir_path = "#{File.dirname(File.expand_path('..', __FILE__))}/knowledge"
|
67
|
+
file_path = "#{dir_path}/#{StringHelper.get_clear_file_path(key)}.json"
|
68
|
+
|
69
|
+
hash = {}
|
70
|
+
if !File.exists?(file_path)
|
71
|
+
|
72
|
+
if block_given?
|
73
|
+
hash = yield hash
|
74
|
+
File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
|
75
|
+
end
|
76
|
+
else
|
77
|
+
old_hash = JSON.parse(File.read(file_path).force_encoding('UTF-8'), symbolize_names: true)
|
78
|
+
hash = yield old_hash
|
79
|
+
File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
|
80
|
+
end
|
81
|
+
|
82
|
+
HashHelper.recursive_symbolize_keys(hash)
|
83
|
+
end
|
84
|
+
|
58
85
|
###
|
59
86
|
# The method helps to get build in knowledge by key.
|
60
87
|
#
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: browser_web_data_entity_sumarization
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0beta1
|
4
|
+
version: 1.0.0beta2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marek Filteš
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-04-
|
11
|
+
date: 2017-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|