browser_web_data_entity_sumarization 1.0.0beta1 → 1.0.0beta2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b24bd4b2cd4ae8c908f6549710531303a49c36e5
4
- data.tar.gz: 0c48cc2e23703a28dbe262c3535703b87698f63f
3
+ metadata.gz: df328fc4a3b8ad15cd85f3ef8cc226e8090c0890
4
+ data.tar.gz: c72c4d29438b7cee335d2d72475f478f0c7da0cd
5
5
  SHA512:
6
- metadata.gz: 786bfb65d8250af636e9a1187cc7530738e734252d405c53bc10c0f0c03ba0ea4953fca5b32613817c50c4c67200df09ea31ff96f0a2ec8f0c481b53a965e00b
7
- data.tar.gz: 3448f20b4aa7b7bd457da33479435fd557ef14c87f13264cf3a99286ecd053511cbf341f0ae2f1cb75a516fe5bf4d9e73bd6a0e46b7de701d93d1938f6851c12
6
+ metadata.gz: 5824a7f65f83928f79d5099534c892b8f2728c9c24fab480572da8b124c48b53d1d17014d87b09f72e1b18f76f85ff4ddc493f1ab598e782094e18afa23bd126
7
+ data.tar.gz: cd38b64882daa6f91e3dee375ddc80709b0f69f695c4b117e225b2b1dc88a5ca2f657939a883bc40580d463fe8e1b9b190d8bc3f9f3cc07b43b9251d04496448
@@ -22,6 +22,7 @@ module BrowserWebData
22
22
 
23
23
  load_identical_predicates
24
24
  load_different_predicates
25
+ load_counts
25
26
  end
26
27
 
27
28
  ###
@@ -32,7 +33,7 @@ module BrowserWebData
32
33
  # @return [String] key
33
34
  def self.get_key(predicates)
34
35
  predicates = [predicates] unless predicates.is_a?(Array)
35
- "<#{predicates.join('><')}>" if predicates && !predicates.empty?
36
+ "<#{predicates.sort.join('><')}>" if predicates && !predicates.empty?
36
37
  end
37
38
 
38
39
  ###
@@ -52,9 +53,12 @@ module BrowserWebData
52
53
  #
53
54
  # @param [Array<String>] predicates
54
55
  def identify_identical_predicates(predicates, identical_limit = @identical_limit)
55
- @temp_counts ||= {}
56
+ combination = predicates.take(IMPORTANCE_TO_IDENTIFY_MAX_COUNT).map { |p| p.to_sym }.combination(2)
57
+ five_times_count = combination.size / 20
58
+
59
+ combination.each_with_index { |values, i|
60
+
56
61
 
57
- predicates.combination(2).each { |values|
58
62
 
59
63
  already_mark_same = find_identical(values)
60
64
  already_mark_different = find_different(values)
@@ -65,16 +69,16 @@ module BrowserWebData
65
69
  # automatically became identical
66
70
  unless try_auto_identical(values)
67
71
 
68
- unless @temp_counts[values[0]]
69
- @temp_counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
72
+ unless @counts[values[0]]
73
+ @counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
70
74
  end
71
75
 
72
- unless @temp_counts[values[1]]
73
- @temp_counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
76
+ unless @counts[values[1]]
77
+ @counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
74
78
  end
75
79
 
76
- x = @temp_counts[values[0]]
77
- y = @temp_counts[values[1]]
80
+ x = @counts[values[0]]
81
+ y = @counts[values[1]]
78
82
  z = @query.get_count_of_identical_predicates(values)
79
83
 
80
84
  identical_level = z / [x, y].max
@@ -86,12 +90,15 @@ module BrowserWebData
86
90
  add_different(values)
87
91
  end
88
92
  end
93
+ end
89
94
 
95
+ if @console_output && ( i == 0 || (i+1) % five_times_count == 0 )
96
+ puts "#{Time.now.localtime} | #{(((i+1)/combination.size) * 100).round(2)}% | [#{(i+1)}/#{combination.size}]"
90
97
  end
91
98
 
92
- true
93
99
  }
94
100
 
101
+ store_counts
95
102
  end
96
103
 
97
104
  ###
@@ -235,6 +242,13 @@ module BrowserWebData
235
242
  end
236
243
  end
237
244
 
245
+ def load_counts
246
+ unless @counts
247
+ file_path = "#{@results_dir_path}/counts.json"
248
+ @counts = ensure_load_json(file_path, {})
249
+ end
250
+ end
251
+
238
252
  def store_identical_properties
239
253
  File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
240
254
  end
@@ -243,6 +257,11 @@ module BrowserWebData
243
257
  File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
244
258
  end
245
259
 
260
+ def store_counts
261
+ File.write("#{@results_dir_path}/counts.json", JSON.generate(@counts))
262
+ end
263
+
264
+
246
265
  def ensure_load_json(file_path, def_val, json_params = {})
247
266
  if File.exists?(file_path)
248
267
  file_data = File.read(file_path).force_encoding('utf-8')
@@ -36,7 +36,7 @@ module BrowserWebData
36
36
  @console_output = console_output
37
37
 
38
38
  @query = SPARQLRequest.new
39
- @predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
39
+ @predicates_similarity = PredicatesSimilarity.new(@results_dir_path, console_output)
40
40
  end
41
41
 
42
42
  ###
@@ -251,25 +251,38 @@ module BrowserWebData
251
251
  # to one result knowledge base file.
252
252
  #
253
253
  # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
254
- # @param [Fixnum] best_count Define max count of properties that will be assign to entity class type.
255
254
  # @param [TrueClass, FalseClass] identify_identical Flag for process identify and group identical properties as one item.
256
- def generate_knowledge_base(type, best_count = 20, identify_identical = true)
255
+ def generate_knowledge_base(type, identify_identical = true)
257
256
  puts "_____ #{type} _____" if @console_output
258
257
  files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
259
258
  type = type.to_s.to_sym
260
259
 
261
260
  knowledge_data = {type => []}
262
261
 
263
- files.each { |file_path|
264
- puts "- calculate #{file_path}" if @console_output
265
- file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
262
+ global_properties = get_global_statistic_by_type(type) || {}
263
+
264
+ if identify_identical
265
+ try_this_identical = {}
266
266
 
267
- if identify_identical
267
+ files.each { |file_path|
268
+ file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
268
269
  file_data[:nif_data].each { |data|
269
- all_properties = data[:properties][type].keys + ((data[:strict_properties]||{})[type] || {}).keys.uniq
270
- @predicates_similarity.identify_identical_predicates(all_properties)
270
+ try_this_identical.merge!(data[:properties][type]) { |_, x, y| x + y }
271
271
  }
272
+ }
273
+
274
+ try_this_identical.merge!(global_properties) { |_, x, y| x + y }
275
+
276
+ if try_this_identical.size > 0
277
+ try_this_identical = Hash[try_this_identical.sort_by { |_,v|v}.reverse]
278
+ puts "- prepare to identify identical: total count #{try_this_identical.size}" if @console_output
279
+ @predicates_similarity.identify_identical_predicates(try_this_identical.keys)
272
280
  end
281
+ end
282
+
283
+ puts "- calculate: files count #{files.size}" if @console_output
284
+ files.each { |file_path|
285
+ file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
273
286
 
274
287
  file_data[:nif_data].each { |found|
275
288
 
@@ -309,10 +322,6 @@ module BrowserWebData
309
322
  end
310
323
  }
311
324
 
312
- global_properties = get_global_statistic_by_type(type) || {}
313
- if identify_identical
314
- @predicates_similarity.identify_identical_predicates(global_properties.keys)
315
- end
316
325
 
317
326
  if global_properties.size > 0
318
327
  max_count = global_properties.max_by { |_, count| count }[1].to_f
@@ -331,7 +340,11 @@ module BrowserWebData
331
340
  hash
332
341
  }
333
342
 
334
- knowledge_data[type] = knowledge_data[type].sort_by { |hash| hash[:score] }.reverse.take(best_count)
343
+ knowledge_data[type] = knowledge_data[type].keep_if { |hash|
344
+ hash[:score] > 0
345
+ }.sort_by { |hash|
346
+ hash[:score]
347
+ }.reverse
335
348
 
336
349
  if identify_identical
337
350
  @predicates_similarity.reduce_identical
@@ -383,7 +396,7 @@ module BrowserWebData
383
396
  # @param [String] path
384
397
  #
385
398
  # @return [Hash] classes
386
- def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
399
+ def get_all_classes(path = File.join(__dir__, '../knowledge/classes_hierarchy.json'))
387
400
  data = ensure_load_json(path, {})
388
401
  HashHelper.recursive_map_keys(data)
389
402
  end
@@ -1,3 +1,3 @@
1
1
  module EntitySumarization
2
- VERSION = '1.0.0beta1'
2
+ VERSION = '1.0.0beta2'
3
3
  end
@@ -8,6 +8,8 @@ module BrowserWebData
8
8
 
9
9
  IDENTICAL_PROPERTY_LIMIT = 0.8
10
10
 
11
+ IMPORTANCE_TO_IDENTIFY_MAX_COUNT = 250
12
+
11
13
  NO_SENSE_PROPERTIES = %w(
12
14
  http://xmlns.com/foaf/0.1/primaryTopic
13
15
  http://dbpedia.org/ontology/wikiPageRedirects
@@ -55,6 +55,33 @@ module CacheHelper
55
55
  HashHelper.recursive_symbolize_keys(hash)
56
56
  end
57
57
 
58
+ ###
59
+ # The method helps to update knowledge by key in yield block.
60
+ #
61
+ # @param [String] key Key of stored knowledge.
62
+ #
63
+ # @yield param actual_data
64
+ # @yield return new_data
65
+ def self.update_knowledge(key)
66
+ dir_path = "#{File.dirname(File.expand_path('..', __FILE__))}/knowledge"
67
+ file_path = "#{dir_path}/#{StringHelper.get_clear_file_path(key)}.json"
68
+
69
+ hash = {}
70
+ if !File.exists?(file_path)
71
+
72
+ if block_given?
73
+ hash = yield hash
74
+ File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
75
+ end
76
+ else
77
+ old_hash = JSON.parse(File.read(file_path).force_encoding('UTF-8'), symbolize_names: true)
78
+ hash = yield old_hash
79
+ File.open(file_path, 'w') { |f| f.puts hash.to_json } unless hash.empty?
80
+ end
81
+
82
+ HashHelper.recursive_symbolize_keys(hash)
83
+ end
84
+
58
85
  ###
59
86
  # The method helps to get build in knowledge by key.
60
87
  #
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: browser_web_data_entity_sumarization
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0beta1
4
+ version: 1.0.0beta2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marek Filteš
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-04-12 00:00:00.000000000 Z
11
+ date: 2017-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement