browser_web_data_entity_sumarization 1.0.0beta1 → 1.0.0beta2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b24bd4b2cd4ae8c908f6549710531303a49c36e5
4
- data.tar.gz: 0c48cc2e23703a28dbe262c3535703b87698f63f
3
+ metadata.gz: df328fc4a3b8ad15cd85f3ef8cc226e8090c0890
4
+ data.tar.gz: c72c4d29438b7cee335d2d72475f478f0c7da0cd
5
5
  SHA512:
6
- metadata.gz: 786bfb65d8250af636e9a1187cc7530738e734252d405c53bc10c0f0c03ba0ea4953fca5b32613817c50c4c67200df09ea31ff96f0a2ec8f0c481b53a965e00b
7
- data.tar.gz: 3448f20b4aa7b7bd457da33479435fd557ef14c87f13264cf3a99286ecd053511cbf341f0ae2f1cb75a516fe5bf4d9e73bd6a0e46b7de701d93d1938f6851c12
6
+ metadata.gz: 5824a7f65f83928f79d5099534c892b8f2728c9c24fab480572da8b124c48b53d1d17014d87b09f72e1b18f76f85ff4ddc493f1ab598e782094e18afa23bd126
7
+ data.tar.gz: cd38b64882daa6f91e3dee375ddc80709b0f69f695c4b117e225b2b1dc88a5ca2f657939a883bc40580d463fe8e1b9b190d8bc3f9f3cc07b43b9251d04496448
@@ -22,6 +22,7 @@ module BrowserWebData
22
22
 
23
23
  load_identical_predicates
24
24
  load_different_predicates
25
+ load_counts
25
26
  end
26
27
 
27
28
  ###
@@ -32,7 +33,7 @@ module BrowserWebData
32
33
  # @return [String] key
33
34
  def self.get_key(predicates)
34
35
  predicates = [predicates] unless predicates.is_a?(Array)
35
- "<#{predicates.join('><')}>" if predicates && !predicates.empty?
36
+ "<#{predicates.sort.join('><')}>" if predicates && !predicates.empty?
36
37
  end
37
38
 
38
39
  ###
@@ -52,9 +53,12 @@ module BrowserWebData
52
53
  #
53
54
  # @param [Array<String>] predicates
54
55
  def identify_identical_predicates(predicates, identical_limit = @identical_limit)
55
- @temp_counts ||= {}
56
+ combination = predicates.take(IMPORTANCE_TO_IDENTIFY_MAX_COUNT).map { |p| p.to_sym }.combination(2)
57
+ five_times_count = combination.size / 20
58
+
59
+ combination.each_with_index { |values, i|
60
+
56
61
 
57
- predicates.combination(2).each { |values|
58
62
 
59
63
  already_mark_same = find_identical(values)
60
64
  already_mark_different = find_different(values)
@@ -65,16 +69,16 @@ module BrowserWebData
65
69
  # automatically became identical
66
70
  unless try_auto_identical(values)
67
71
 
68
- unless @temp_counts[values[0]]
69
- @temp_counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
72
+ unless @counts[values[0]]
73
+ @counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
70
74
  end
71
75
 
72
- unless @temp_counts[values[1]]
73
- @temp_counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
76
+ unless @counts[values[1]]
77
+ @counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
74
78
  end
75
79
 
76
- x = @temp_counts[values[0]]
77
- y = @temp_counts[values[1]]
80
+ x = @counts[values[0]]
81
+ y = @counts[values[1]]
78
82
  z = @query.get_count_of_identical_predicates(values)
79
83
 
80
84
  identical_level = z / [x, y].max
@@ -86,12 +90,15 @@ module BrowserWebData
86
90
  add_different(values)
87
91
  end
88
92
  end
93
+ end
89
94
 
95
+ if @console_output && ( i == 0 || (i+1) % five_times_count == 0 )
96
+ puts "#{Time.now.localtime} | #{(((i+1)/combination.size) * 100).round(2)}% | [#{(i+1)}/#{combination.size}]"
90
97
  end
91
98
 
92
- true
93
99
  }
94
100
 
101
+ store_counts
95
102
  end
96
103
 
97
104
  ###
@@ -235,6 +242,13 @@ module BrowserWebData
235
242
  end
236
243
  end
237
244
 
245
+ def load_counts
246
+ unless @counts
247
+ file_path = "#{@results_dir_path}/counts.json"
248
+ @counts = ensure_load_json(file_path, {})
249
+ end
250
+ end
251
+
238
252
  def store_identical_properties
239
253
  File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
240
254
  end
@@ -243,6 +257,11 @@ module BrowserWebData
243
257
  File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
244
258
  end
245
259
 
260
+ def store_counts
261
+ File.write("#{@results_dir_path}/counts.json", JSON.generate(@counts))
262
+ end
263
+
264
+
246
265
  def ensure_load_json(file_path, def_val, json_params = {})
247
266
  if File.exists?(file_path)
248
267
  file_data = File.read(file_path).force_encoding('utf-8')
@@ -36,7 +36,7 @@ module BrowserWebData
36
36
  @console_output = console_output
37
37
 
38
38
  @query = SPARQLRequest.new
39
- @predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
39
+ @predicates_similarity = PredicatesSimilarity.new(@results_dir_path, console_output)
40
40
  end
41
41
 
42
42
  ###
@@ -251,25 +251,38 @@ module BrowserWebData
251
251
  # to one result knowledge base file.
252
252
  #
253
253
  # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
254
- # @param [Fixnum] best_count Define max count of properties that will be assign to entity class type.
255
254
  # @param [TrueClass, FalseClass] identify_identical Flag that enables identifying and grouping identical properties as one item.
256
- def generate_knowledge_base(type, best_count = 20, identify_identical = true)
255
+ def generate_knowledge_base(type, identify_identical = true)
257
256
  puts "_____ #{type} _____" if @console_output
258
257
  files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
259
258
  type = type.to_s.to_sym
260
259
 
261
260
  knowledge_data = {type => []}
262
261
 
263
- files.each { |file_path|
264
- puts "- calculate #{file_path}" if @console_output
265
- file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
262
+ global_properties = get_global_statistic_by_type(type) || {}
263
+
264
+ if identify_identical
265
+ try_this_identical = {}
266
266
 
267
- if identify_identical
267
+ files.each { |file_path|
268
+ file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
268
269
  file_data[:nif_data].each { |data|
269
- all_properties = data[:properties][type].keys + ((data[:strict_properties]||{})[type] || {}).keys.uniq
270
- @predicates_similarity.identify_identical_predicates(all_properties)
270
+ try_this_identical.merge!(data[:properties][type]) { |_, x, y| x + y }
271
271
  }
272
+ }
273
+
274
+ try_this_identical.merge!(global_properties) { |_, x, y| x + y }
275
+
276
+ if try_this_identical.size > 0
277
+ try_this_identical = Hash[try_this_identical.sort_by { |_,v|v}.reverse]
278
+ puts "- prepare to identify identical: total count #{try_this_identical.size}" if @console_output
279
+ @predicates_similarity.identify_identical_predicates(try_this_identical.keys)
272
280
  end
281
+ end
282
+
283
+ puts "- calculate: files count #{files.size}" if @console_output
284
+ files.each { |file_path|
285
+ file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
273
286
 
274
287
  file_data[:nif_data].each { |found|
275
288
 
@@ -309,10 +322,6 @@ module BrowserWebData
309
322
  end
310
323
  }
311
324
 
312
- global_properties = get_global_statistic_by_type(type) || {}
313
- if identify_identical
314
- @predicates_similarity.identify_identical_predicates(global_properties.keys)
315
- end
316
325
 
317
326
  if global_properties.size > 0
318
327
  max_count = global_properties.max_by { |_, count| count }[1].to_f
@@ -331,7 +340,11 @@ module BrowserWebData
331
340
  hash
332
341
  }
333
342
 
334
- knowledge_data[type] = knowledge_data[type].sort_by { |hash| hash[:score] }.reverse.take(best_count)
343
+ knowledge_data[type] = knowledge_data[type].keep_if { |hash|
344
+ hash[:score] > 0
345
+ }.sort_by { |hash|
346
+ hash[:score]
347
+ }.reverse
335
348
 
336
349
  if identify_identical
337
350
  @predicates_similarity.reduce_identical
@@ -383,7 +396,7 @@ module BrowserWebData
383
396
  # @param [String] path
384
397
  #
385
398
  # @return [Hash] classes
386
- def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
399
+ def get_all_classes(path = File.join(__dir__, '../knowledge/classes_hierarchy.json'))
387
400
  data = ensure_load_json(path, {})
388
401
  HashHelper.recursive_map_keys(data)
389
402
  end
@@ -1,3 +1,3 @@
1
1
  module EntitySumarization
2
- VERSION = '1.0.0beta1'
2
+ VERSION = '1.0.0beta2'
3
3
  end
@@ -8,6 +8,8 @@ module BrowserWebData
8
8
 
9
9
  IDENTICAL_PROPERTY_LIMIT = 0.8
10
10
 
11
+ IMPORTANCE_TO_IDENTIFY_MAX_COUNT = 250
12
+
11
13
  NO_SENSE_PROPERTIES = %w(
12
14
  http://xmlns.com/foaf/0.1/primaryTopic
13
15
  http://dbpedia.org/ontology/wikiPageRedirects
@@ -55,6 +55,33 @@ module CacheHelper
55
55
  HashHelper.recursive_symbolize_keys(hash)
56
56
  end
57
57
 
58
+ ###
59
+ # The method updates the knowledge stored under the given key with the value returned by the yield block.
60
+ #
61
+ # @param [String] key Key of stored knowledge.
62
+ #
63
+ # @yieldparam actual_data [Hash] currently stored knowledge (an empty hash when nothing is stored yet)
64
+ # @yieldreturn [Hash] new data to store; an empty result is not written to the file
65
+ def self.update_knowledge(key)
66
+ dir_path = "#{File.dirname(File.expand_path('..', __FILE__))}/knowledge"
67
+ file_path = "#{dir_path}/#{StringHelper.get_clear_file_path(key)}.json"
68
+
69
+ hash = {}
70
+ if !File.exists?(file_path)
71
+
72
+ if block_given?
73
+ hash = yield hash
74
+ File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
75
+ end
76
+ else
77
+ old_hash = JSON.parse(File.read(file_path).force_encoding('UTF-8'), symbolize_names: true)
78
+ hash = yield old_hash
79
+ File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
80
+ end
81
+
82
+ HashHelper.recursive_symbolize_keys(hash)
83
+ end
84
+
58
85
  ###
59
86
  # The method helps to get build in knowledge by key.
60
87
  #
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: browser_web_data_entity_sumarization
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0beta1
4
+ version: 1.0.0beta2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marek Filteš
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-04-12 00:00:00.000000000 Z
11
+ date: 2017-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement