browser_web_data_entity_sumarization 1.0.0beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b24bd4b2cd4ae8c908f6549710531303a49c36e5
4
+ data.tar.gz: 0c48cc2e23703a28dbe262c3535703b87698f63f
5
+ SHA512:
6
+ metadata.gz: 786bfb65d8250af636e9a1187cc7530738e734252d405c53bc10c0f0c03ba0ea4953fca5b32613817c50c4c67200df09ea31ff96f0a2ec8f0c481b53a965e00b
7
+ data.tar.gz: 3448f20b4aa7b7bd457da33479435fd557ef14c87f13264cf3a99286ecd053511cbf341f0ae2f1cb75a516fe5bf4d9e73bd6a0e46b7de701d93d1938f6851c12
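The two digest sections list SHA1 and SHA512 checksums for the gem's metadata.gz and data.tar.gz members. A minimal verification sketch in Ruby, assuming the downloaded .gem archive has already been unpacked (e.g. with `tar -xf`) so both members sit in the current directory:

    require 'digest'

    # compare these hex digests with the SHA512 entries above
    %w[metadata.gz data.tar.gz].each do |member|
      puts "#{member}: #{Digest::SHA512.file(member).hexdigest}"
    end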
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+ require 'sparql/client'
3
+ require 'benchmark'
4
+ require 'json'
5
+
6
+ module BrowserWebData
7
+ module EntitySumarization
8
+
9
+ end
10
+ end
11
+
12
+ Dir.glob(File.dirname(__FILE__) + '/utils/*.rb').each { |file| require file }
13
+ Dir.glob(File.dirname(__FILE__) + '/config/*.rb').each { |file| require file }
14
+
15
+ # Require all gem scripts by their relative names
16
+ Dir[File.dirname(__FILE__) + '/browser_web_data_entity_sumarization/**/*.rb'].each do |file|
17
+ require(file.gsub('\\', '/').split('/lib/').last[0..-4])
18
+ end
19
+
20
+
21
+
22
+
23
+
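The last loop above requires every script of the gem by a path relative to lib/, so it can be resolved through the load path. A small sketch of how the expression inside `require` rewrites a path (the absolute path below is hypothetical); the `[0..-4]` slice strips the trailing '.rb':

    file = 'C:\\gems\\browser_web_data_entity_sumarization\\lib\\browser_web_data_entity_sumarization\\statistic.rb'

    file.gsub('\\', '/').split('/lib/').last[0..-4]
    # => "browser_web_data_entity_sumarization/statistic"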
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+ ###
12
+ # The class includes helpers to retrieve structured NIF data from NIF lines.
13
+ class NIFLineParser
14
+ include BrowserWebData::EntitySumarizationConfig
15
+
16
+ ###
17
+ # The method applies a scan to recognize the resource URI from a given NIF dataset line.
18
+ #
19
+ # @param [String] line
20
+ #
21
+ # @return [String] resource_uri
22
+ # @example resource_uri: "http://dbpedia.org/resource/Captain_EO"
23
+ def self.parse_resource_uri(line)
24
+ (line.scan(SCAN_REGEXP[:scan_resource])[0])[0].split('?').first
25
+ end
26
+
27
+ ###
28
+ # The method applies a scan to recognize the link, anchor, indexes and section from a given group of 7 NIF dataset lines.
29
+ #
30
+ # @param [Array<String>] lines_group
31
+ #
32
+ # @return [Hash] nif_data
33
+ # @example nif_data:
34
+ # {
35
+ # link: "http://dbpedia.org/resource/Science_fiction_film",
36
+ # anchor: "science fiction film",
37
+ # indexes: ["33", "53"],
38
+ # section: "paragraph_0_419"
39
+ # }
40
+ def self.parse_line_group(lines_group)
41
+ begin_index = lines_group[2].scan(SCAN_REGEXP[:begin_index])[0]
42
+ end_index = lines_group[3].scan(SCAN_REGEXP[:end_index])[0]
43
+ target_resource_link = lines_group[5].scan(SCAN_REGEXP[:target_resource_link])[0]
44
+ section = lines_group[4].scan(SCAN_REGEXP[:section])[0]
45
+ anchor = lines_group[6].scan(SCAN_REGEXP[:anchor])[0]
46
+
47
+ {
48
+ link: target_resource_link[1].force_encoding('utf-8'),
49
+ anchor: anchor[1].force_encoding('utf-8'),
50
+ indexes: [begin_index[1], end_index[1]],
51
+ section: section[0].split('=')[1]
52
+ }
53
+ end
54
+
55
+ end
56
+
57
+ end
58
+
59
+ end
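A usage sketch for the parser; the concrete NIF lines are omitted here because the exact patterns live in SCAN_REGEXP, which is defined by the gem's config scripts and is not part of this diff:

    require 'browser_web_data_entity_sumarization'

    # lines_group is assumed to already hold the 7 consecutive dataset lines that
    # describe one link (grouping is done by Statistic#generate_statistics_from_nif)
    parser = BrowserWebData::EntitySumarization::NIFLineParser

    resource_uri = parser.parse_resource_uri(lines_group[0])
    nif_data     = parser.parse_line_group(lines_group)

    nif_data[:link]    # e.g. "http://dbpedia.org/resource/Science_fiction_film"
    nif_data[:indexes] # e.g. ["33", "53"], character offsets of the anchor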
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+
12
+ ###
13
+ # The class includes helper methods.
14
+ # (TODO: add a definition of a Predicate instance)
15
+ class Predicate
16
+ include BrowserWebData::EntitySumarizationConfig
17
+
18
+ ###
19
+ # The method helps to identify unimportant predicates by means of constant lists.
20
+ #
21
+ # @param [String] property
22
+ #
23
+ # @return [TrueClass, FalseClass] result
24
+ def self.unimportant?(property)
25
+ property = property.to_s
26
+ NO_SENSE_PROPERTIES.include?(property) || COMMON_PROPERTIES.include?(property)
27
+ end
28
+
29
+ end
30
+
31
+ end
32
+
33
+ end
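A usage sketch; NO_SENSE_PROPERTIES and COMMON_PROPERTIES come from EntitySumarizationConfig (loaded from the config scripts, which are not shown in this diff), so the concrete answers below are only assumptions about their contents:

    require 'browser_web_data_entity_sumarization'

    predicate_class = BrowserWebData::EntitySumarization::Predicate

    # true for housekeeping predicates listed in the constants,
    # false for predicates that should stay in the statistics
    predicate_class.unimportant?('http://dbpedia.org/ontology/wikiPageID')  # assumed true
    predicate_class.unimportant?('http://dbpedia.org/ontology/director')    # assumed false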
@@ -0,0 +1,263 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+ ###
12
+ # The class includes methods to identify identical predicates.
13
+ class PredicatesSimilarity
14
+ include BrowserWebData::EntitySumarizationConfig
15
+
16
+ def initialize(results_dir_path, identical_limit = IDENTICAL_PROPERTY_LIMIT, console_output = false)
17
+ @results_dir_path = results_dir_path
18
+ @console_output = console_output
19
+ @identical_limit = identical_limit
20
+
21
+ @query = SPARQLRequest.new
22
+
23
+ load_identical_predicates
24
+ load_different_predicates
25
+ end
26
+
27
+ ###
28
+ # The method returns a key for the given identical predicates.
29
+ #
30
+ # @param [Array<String>] predicates
31
+ #
32
+ # @return [String] key
33
+ def self.get_key(predicates)
34
+ predicates = [predicates] unless predicates.is_a?(Array)
35
+ "<#{predicates.join('><')}>" if predicates && !predicates.empty?
36
+ end
37
+
38
+ ###
39
+ # The method returns the identical predicates encoded in the given key.
40
+ #
41
+ # @param [String] key
42
+ #
43
+ # @return [Array<String>] predicates
44
+ def self.parse_key(key)
45
+ key.to_s.scan(SCAN_REGEXP[:identical_key]).reduce(:+)
46
+ end
47
+
48
+ ###
49
+ # The method verifies every combination of two predicates.
50
+ # It stores the identified combinations in two files, identical_predicates.json and different_predicates.json;
51
+ # both files contain an Array of combination keys.
52
+ #
53
+ # @param [Array<String>] predicates
54
+ def identify_identical_predicates(predicates, identical_limit = @identical_limit)
55
+ @temp_counts ||= {}
56
+
57
+ predicates.combination(2).each { |values|
58
+
59
+ already_mark_same = find_identical(values)
60
+ already_mark_different = find_different(values)
61
+
62
+ if already_mark_same.nil? && already_mark_different.nil?
63
+
64
+ # in the case of a dbpedia ontology vs. property pair with the same local name,
65
+ # the two predicates automatically become identical
66
+ unless try_auto_identical(values)
67
+
68
+ unless @temp_counts[values[0]]
69
+ @temp_counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
70
+ end
71
+
72
+ unless @temp_counts[values[1]]
73
+ @temp_counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
74
+ end
75
+
76
+ x = @temp_counts[values[0]]
77
+ y = @temp_counts[values[1]]
78
+ z = @query.get_count_of_identical_predicates(values)
79
+
80
+ # use float division so the similarity ratio is not truncated
+ identical_level = z.to_f / [x, y].max
81
+
82
+ if identical_level >= identical_limit
83
+ puts " - result[#{identical_level}] z[#{z}] x[#{x}] y[#{y}] #{values.inspect}" if @console_output
84
+ add_identical(values)
85
+ else
86
+ add_different(values)
87
+ end
88
+ end
89
+
90
+ end
91
+
92
+ true
93
+ }
94
+
95
+ end
96
+
97
+ ###
98
+ # The method helps to recognize whether the value is already marked as identical properties.
99
+ #
100
+ # @param [Array<String>, String] value
101
+ #
102
+ # @return [String, NilClass]
103
+ def find_identical(value)
104
+ raise RuntimeError.new('Identifying identical predicates is not supported for more than 2 predicates.') if value.is_a?(Array) && value.size > 2
105
+
106
+ predicates_key = case value
107
+ when Array
108
+ value = value.map { |v| PredicatesSimilarity.get_key(v) }
109
+ @identical_predicates.find { |p|
110
+ p[value[0]] && p[value[1]]
111
+ }
112
+ else
113
+ value = PredicatesSimilarity.get_key(value)
114
+ @identical_predicates.find { |p|
115
+ p[value]
116
+ }
117
+ end
118
+
119
+ PredicatesSimilarity.parse_key(predicates_key)
120
+ end
121
+
122
+ ###
123
+ # The method helps to recognize whether the value is already marked as different properties.
124
+ #
125
+ # @param [Array<String>, String] value
126
+ #
127
+ # @return [String, NilClass]
128
+ def find_different(value)
129
+ raise RuntimeError.new('Identifying identical predicates is not supported for more than 2 predicates.') if value.is_a?(Array) && value.size > 2
130
+
131
+ key = case value
132
+ when Array
133
+ value = value.map { |v| PredicatesSimilarity.get_key(v) }
134
+ @different_predicates.find { |p| p[value[0]] && p[value[1]] }
135
+ else
136
+ value = PredicatesSimilarity.get_key(value)
137
+ @different_predicates.find { |p| p[value] }
138
+ end
139
+
140
+ PredicatesSimilarity.parse_key(key)
141
+ end
142
+
143
+ def add_identical(values)
144
+ values = values.map { |p| p.to_s }.uniq.sort
145
+ group_key = PredicatesSimilarity.get_key(values)
146
+
147
+ unless @identical_predicates.include?(group_key)
148
+ @identical_predicates << group_key
149
+ store_identical_properties
150
+ end
151
+ end
152
+
153
+ def add_different(values)
154
+ values = values.map { |p| p.to_s }.uniq.sort
155
+ group_key = PredicatesSimilarity.get_key(values)
156
+
157
+ unless @different_predicates.include?(group_key)
158
+ @different_predicates << group_key
159
+
160
+ @new_diff_counter ||= 0
161
+ @new_diff_counter += 1
162
+
163
+ if @new_diff_counter > 100
164
+ store_different_predicates
165
+ @new_diff_counter = 0
166
+ end
167
+
168
+ end
169
+ end
170
+
171
+ def try_auto_identical(values)
172
+ group_key = PredicatesSimilarity.get_key(values)
173
+
174
+ temp = values.map { |val| val.to_s.split('/').last }.uniq
175
+ if temp.size == 1 && group_key['property/'] && group_key['ontology/']
176
+ add_identical(values)
177
+ true
178
+ else
179
+ false
180
+ end
181
+ end
182
+
183
+
184
+ ###
185
+ # The method helps to reduce identical predicates by joining groups that share a common predicate.
186
+ def reduce_identical
187
+ new_identical = []
188
+
189
+ @identical_predicates.each { |key|
190
+ values = PredicatesSimilarity.parse_key(key)
191
+ next if new_identical.find { |v| !(v & values).empty? }
192
+
193
+ ## find nodes with values predicates
194
+ values = recursive_find_identical(key, values)
195
+
196
+ new_identical << values.uniq.sort
197
+ }
198
+
199
+ @identical_predicates = new_identical.map { |v| PredicatesSimilarity.get_key(v) }
200
+
201
+ store_identical_properties
202
+ end
203
+
204
+ def recursive_find_identical(keys, values)
205
+ keys = [keys] unless keys.is_a?(Array)
206
+
207
+ @identical_predicates.each { |this_key|
208
+ next if keys.include?(this_key)
209
+ temp = PredicatesSimilarity.parse_key(this_key)
210
+
211
+ unless (temp & values).empty?
212
+ keys << this_key
213
+ return recursive_find_identical(keys, (values + temp).uniq)
214
+ end
215
+ }
216
+
217
+ values
218
+ end
219
+
220
+
221
+ private
222
+
223
+
224
+ def load_identical_predicates
225
+ unless @identical_predicates
226
+ file_path = "#{@results_dir_path}/identical_predicates.json"
227
+ @identical_predicates = ensure_load_json(file_path, [])
228
+ end
229
+ end
230
+
231
+ def load_different_predicates
232
+ unless @different_predicates
233
+ file_path = "#{@results_dir_path}/different_predicates.json"
234
+ @different_predicates = ensure_load_json(file_path, [])
235
+ end
236
+ end
237
+
238
+ def store_identical_properties
239
+ File.write("#{@results_dir_path}/identical_predicates.json", JSON.generate(@identical_predicates))
240
+ end
241
+
242
+ def store_different_predicates
243
+ File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
244
+ end
245
+
246
+ def ensure_load_json(file_path, def_val, json_params = {})
247
+ if File.exists?(file_path)
248
+ file_data = File.read(file_path).force_encoding('utf-8')
249
+ if file_data.size >= 2 # '[]'
250
+ JSON.parse(file_data, json_params)
251
+ else
252
+ def_val
253
+ end
254
+ else
255
+ def_val
256
+ end
257
+ end
258
+
259
+ end
260
+
261
+ end
262
+
263
+ end
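To make the similarity test concrete, here is a worked sketch of the ratio computed in identify_identical_predicates; the counts are invented, and get_count_of_identical_predicates is assumed to return the number of subjects using a predicate (or both predicates when given a pair):

    x = 12_000   # subjects using http://dbpedia.org/ontology/starring
    y = 9_500    # subjects using http://dbpedia.org/property/starring
    z = 9_000    # subjects using both predicates at once

    identical_level = z.to_f / [x, y].max   # => 0.75

    # with an IDENTICAL_PROPERTY_LIMIT of, say, 0.7 the pair is stored under the key
    # "<http://dbpedia.org/ontology/starring><http://dbpedia.org/property/starring>",
    # and PredicatesSimilarity.parse_key turns that key back into the two URIs

reduce_identical then merges keys transitively, so if A is identical to B and B to C, all three predicates end up in a single group.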
@@ -0,0 +1,609 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+ ###
12
+ # The Statistic class allows finding, collecting and generating knowledge for entity summarization.
13
+ # Entity summarization is based on datasets in the NLP Interchange Format (NIF),
14
+ # for example the datasets from http://wiki.dbpedia.org/nif-abstract-datasets
15
+ # The knowledge is generated from information in DBpedia.
16
+ class Statistic
17
+ include BrowserWebData::EntitySumarizationConfig
18
+
19
+ attr_reader :nif_file_path, :results_dir_path
20
+
21
+ ###
22
+ # Create new instance.
23
+ #
24
+ # @param [String] nif_dataset_path
25
+ # @param [String] results_dir_path
26
+ # @param [TrueClass, FalseClass] console_output Allows printing info to the console. Default is false.
27
+ def initialize(nif_dataset_path, results_dir_path = File.join(__dir__, '../../results'), console_output = false)
28
+ nif_dataset_path = nif_dataset_path.gsub('\\', '/')
29
+ results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
30
+
31
+ # a return value from #initialize is ignored, so fail fast when the paths are missing
+ raise(ArgumentError, "nif dataset not found: #{nif_dataset_path}") unless File.exist?(nif_dataset_path)
32
+ raise(ArgumentError, "results directory not found: #{results_dir_path}") unless File.exist?(results_dir_path)
33
+
34
+ @nif_file_path = nif_dataset_path.gsub('\\', '/')
35
+ @results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
36
+ @console_output = console_output
37
+
38
+ @query = SPARQLRequest.new
39
+ @predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
40
+ end
41
+
42
+ ###
43
+ # The method finds resource links in the given NIF dataset file.
44
+ #
45
+ # @param [Hash] params
46
+ # @option params [Array<String>, String] :entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
47
+ # @option params [Fixnum] :entity_count Number of best ranked resources taken for every entity type.
48
+ # @option params [Fixnum] :best_score_count Count of result predicates to keep.
49
+ # @option params [FalseClass, TrueClass] :demand_reload
50
+ # @option params [FalseClass, TrueClass] :identity_identical_predicates
51
+ def create_by_nif_dataset(params)
52
+ params[:entity_types] = [params[:entity_types]] unless params[:entity_types].is_a?(Array)
53
+
54
+ generate_statistics_from_nif(params[:entity_types], params[:entity_count], params[:demand_reload])
55
+
56
+ params[:entity_types].each { |type|
57
+ generate_knowledge_base(type, params[:best_score_count], params[:identity_identical_predicates])
58
+ }
59
+ end
60
+
61
+ ###
62
+ # The method returns a list of the best ranked resources for the required entity types.
63
+ #
64
+ # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
65
+ # @param [Fixnum] count Count of best ranked resources
66
+ #
67
+ # @return [Hash] resources
68
+ def get_best_ranked_resources(entity_types, count = 10)
69
+ resources = {}
70
+ entity_types = [entity_types] unless entity_types.is_a?(Array)
71
+
72
+ entity_types.each { |type|
73
+ top_ranked_entities = @query.get_resources_by_dbpedia_page_rank(type, count)
74
+
75
+ top_ranked_entities.each { |solution|
76
+ resources[solution.entity.value] = {type: type, rank: solution.rank.value.to_f}
77
+ }
78
+ }
79
+
80
+ resources
81
+ end
82
+
83
+ ###
84
+ # The method finds links in the given nif dataset, then collects relations via #find_relations.
85
+ # For each resource it generates a file in @results_dir_path.
86
+ #
87
+ # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
88
+ # @param [Fixnum] count Count of best ranked resources
89
+ # @param [FalseClass, TrueClass] demand_reload
90
+ def generate_statistics_from_nif(entity_types, count = 10, demand_reload = false)
91
+ resources = get_best_ranked_resources(entity_types, count)
92
+
93
+ resources = keep_unloaded(resources) unless demand_reload
94
+
95
+ actual_resource_data = []
96
+ lines_group = []
97
+
98
+ begin
99
+ time_start = Time.now
100
+ nif_file = File.open(@nif_file_path, 'r')
101
+ line = nif_file.readline
102
+
103
+ until nif_file.eof?
104
+ line = nif_file.readline
105
+
106
+ if lines_group.size == 7
107
+ # evaluate group (7 lines)
108
+ this_resource_uri = NIFLineParser.parse_resource_uri(lines_group[0])
109
+
110
+ if resources.keys.include?(this_resource_uri)
111
+ # process the group; this resource is requested
112
+ resource_uri = this_resource_uri
113
+ actual_resource_data << NIFLineParser.parse_line_group(lines_group)
114
+
115
+ elsif !actual_resource_data.empty?
116
+ # resource changed, process actual_resource_data
117
+ resource_hash = resources.delete(resource_uri)
118
+ type = resource_hash[:type]
119
+
120
+ this_time = (Time.now - time_start).round(2)
121
+ puts "\n#{resource_uri}\n- nif found in #{this_time}\n- resources to find #{resources.size}" if @console_output
122
+
123
+ result_relations = find_relations(resource_uri, actual_resource_data, type)
124
+ generate_result_file(resource_uri, type, result_relations, this_time)
125
+
126
+ break if resources.empty?
127
+
128
+ actual_resource_data = []
129
+ time_start = Time.now
130
+ end
131
+
132
+ # start new group
133
+ lines_group = [line]
134
+ else
135
+
136
+ # join line to group
137
+ lines_group << line
138
+ end
139
+
140
+ end
141
+
142
+ ensure
143
+ nif_file.close if nif_file && !nif_file.closed?
144
+ end
145
+ end
146
+
147
+ ###
148
+ # The method helps to recollect relations from already generated result files.
149
+ #
150
+ # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
151
+ # @param [Fixnum] count Count of best ranked resources
152
+ def refresh_statistics_in_files(entity_types, count = 10)
153
+ resources = get_best_ranked_resources(entity_types, count)
154
+
155
+ resources = keep_loaded(resources)
156
+
157
+ resources.each { |resource_uri, resource_info|
158
+ puts "_____ #{resource_uri} _____" if @console_output
159
+
160
+ update_nif_file_properties(resource_uri, resource_info[:type]) { |link|
161
+ get_predicates_by_link(resource_uri, link, resource_info[:type])
162
+ }
163
+ }
164
+
165
+ end
166
+
167
+
168
+ ###
169
+ # The method finds predicates for the given link.
170
+ # It finds strict predicates that are in the relation: <resource> ?predicate <link> .
171
+ # It finds predicates that are in the relation: ?subject a <type> . ?subject ?predicate <link>
172
+ #
173
+ # @param [String] resource_uri Resource for which strict properties will be found.
174
+ # @param [String] link Link that has some importance to resource or entity type.
175
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
176
+ #
177
+ # @return [Hash] result
178
+ def get_predicates_by_link(resource_uri, link, type)
179
+ properties = {type => {}}
180
+ strict_properties = {type => {}}
181
+
182
+ @query.get_all_predicates_by_subject_object(resource_uri, link).each { |solution|
183
+ predicate = solution.to_h
184
+ property = predicate[:property].to_s.force_encoding('utf-8')
185
+
186
+ next if Predicate.unimportant?(property)
187
+
188
+ count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
189
+ strict_properties[type][property] = count if count > 0
190
+ }
191
+
192
+ @query.get_all_predicates_by_object(link).each { |solution|
193
+ predicate = solution.to_h
194
+ property = predicate[:property].to_s.force_encoding('utf-8')
195
+
196
+ next if Predicate.unimportant?(property) || strict_properties[type][property]
197
+
198
+ count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
199
+ properties[type][property] = count if count > 0
200
+ }
201
+
202
+
203
+ {properties: properties, strict_properties: strict_properties}
204
+ end
205
+
206
+ ###
207
+ # The method helps to store the found information from nif for a given resource.
208
+ #
209
+ # @param [String] resource_uri
210
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
211
+ # @param [Hash] result_relations Hash generated by method #find_relations
212
+ # @option result_relations [Hash] :sections Contains key 'section_type' value 'position'
213
+ # @option result_relations [Array<Hash>] :relations Hashes generated by method #get_predicates_by_link
214
+ #
215
+ # @param [Float] this_time Relative time of find in nif dataset.
216
+ def generate_result_file(resource_uri, type, result_relations, this_time)
217
+ section_degradation = result_relations[:sections].map { |section_type, position|
218
+ index = result_relations[:sections].keys.index(section_type)
219
+
220
+ # recognize the degradation value from the relative position of the paragraph in the document
221
+ position[:degradation] = 1 - ((index.to_f / result_relations[:sections].size) / 10.0)
222
+
223
+ {section_type => position}
224
+ }.reduce(:merge)
225
+
226
+ total_size = section_degradation.max_by { |_, v| v[:to] }[1][:to].to_f
227
+
228
+ result_nif_data = result_relations[:relations].map { |relation|
229
+ paragraph_position = section_degradation[relation[:section]]
230
+
231
+ # the weight decreases with the relative distance from the document start
232
+ position_weight = (1 - ((relation[:indexes][0].to_i) / total_size))
233
+ # the weight is further degraded by the position of the paragraph
234
+ relation[:weight] = (position_weight * paragraph_position[:degradation]).round(4)
235
+
236
+ relation
237
+ }
238
+
239
+ result = {
240
+ process_time: {nif_find: this_time, relations_find: result_relations[:time]},
241
+ resource_uri: resource_uri,
242
+ nif_data: result_nif_data
243
+ }
244
+
245
+ result_path = get_resource_file_path(resource_uri, type)
246
+ File.open(result_path, 'w:utf-8') { |f| f << JSON.pretty_generate(result) }
247
+ end
248
+
249
+ ###
250
+ # The method processes all generated result files from the nif dataset (by entity class type)
251
+ # into one resulting knowledge base file.
252
+ #
253
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
254
+ # @param [Fixnum] best_count Defines the max count of properties that will be assigned to the entity class type.
255
+ # @param [TrueClass, FalseClass] identify_identical Flag to identify and group identical properties as one item.
256
+ def generate_knowledge_base(type, best_count = 20, identify_identical = true)
257
+ puts "_____ #{type} _____" if @console_output
258
+ files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
259
+ type = type.to_s.to_sym
260
+
261
+ knowledge_data = {type => []}
262
+
263
+ files.each { |file_path|
264
+ puts "- calculate #{file_path}" if @console_output
265
+ file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
266
+
267
+ if identify_identical
268
+ file_data[:nif_data].each { |data|
269
+ all_properties = data[:properties][type].keys + ((data[:strict_properties]||{})[type] || {}).keys.uniq
270
+ @predicates_similarity.identify_identical_predicates(all_properties)
271
+ }
272
+ end
273
+
274
+ file_data[:nif_data].each { |found|
275
+
276
+ properties = found[:properties][type.to_sym]
277
+ strict_properties = (found[:strict_properties] ||{})[type] || {}
278
+ weight = found[:weight]
279
+
280
+ strict_properties.each { |property, count|
281
+ property = property.to_s
282
+ value = count.to_i * weight
283
+
284
+ prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
285
+ old_score = from_knowledge[:score] * from_knowledge[:counter]
286
+ from_knowledge[:counter] += 1
287
+ (old_score + value) / from_knowledge[:counter]
288
+ }
289
+ }
290
+
291
+ properties.each { |property, count|
292
+ property = property.to_s
293
+ value = count.to_i * weight
294
+
295
+ prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
296
+ old_score = from_knowledge[:score] * from_knowledge[:counter]
297
+ from_knowledge[:counter] += 1
298
+ (old_score + value) / from_knowledge[:counter]
299
+ }
300
+ }
301
+ }
302
+
303
+ unless knowledge_data[type].empty?
304
+ max_weight = knowledge_data[type].max_by { |data| data[:score] }[:score]
305
+ knowledge_data[type] = knowledge_data[type].map { |hash|
306
+ hash[:score] = (hash[:score] / max_weight).round(4)
307
+ hash
308
+ }
309
+ end
310
+ }
311
+
312
+ global_properties = get_global_statistic_by_type(type) || {}
313
+ if identify_identical
314
+ @predicates_similarity.identify_identical_predicates(global_properties.keys)
315
+ end
316
+
317
+ if global_properties.size > 0
318
+ max_count = global_properties.max_by { |_, count| count }[1].to_f
319
+ global_properties.each { |property, count|
320
+
321
+ value = count / max_count
322
+
323
+ prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
324
+ from_knowledge[:score] > 0 ? ((from_knowledge[:score] + value) / 2.0).round(4) : value.round(4)
325
+ }
326
+ }
327
+ end
328
+
329
+ knowledge_data[type].map! { |hash|
330
+ hash.delete(:counter)
331
+ hash
332
+ }
333
+
334
+ knowledge_data[type] = knowledge_data[type].sort_by { |hash| hash[:score] }.reverse.take(best_count)
335
+
336
+ if identify_identical
337
+ @predicates_similarity.reduce_identical
338
+ end
339
+
340
+ update_knowledge_base(knowledge_data)
341
+ end
342
+
343
+ ###
344
+ # The method generates simple statistics that contain all predicates linking to a literal.
345
+ # Predicates are grouped by entity class type together with their total occurrence count.
346
+ # Predicates are found from the best ranked resources.
347
+ #
348
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
349
+ # @param [Fixnum] count Count of best ranked resources
350
+ def generate_literal_statistics(type = nil, count = 10)
351
+ unless type
352
+ type = get_all_classes
353
+ end
354
+
355
+ type = [type] unless type.is_a?(Array)
356
+
357
+ type.each_with_index { |entity_type, index|
358
+ all_properties = {}
359
+ puts "#{__method__} - start process entity type: #{entity_type} [#{(index / type.size.to_f).round(2)}]" if @console_output
360
+ entity_type = entity_type.to_s.to_sym
361
+
362
+ get_best_ranked_resources(entity_type, count).each { |resource, _|
363
+ properties = @query.get_all_predicates_by_subject(resource.to_s, true).map { |solution_prop|
364
+ solution_prop[:property].to_s
365
+ } || []
366
+
367
+ properties.uniq.each { |prop|
368
+ next if Predicate.unimportant?(prop)
369
+ all_properties[entity_type] ||= {}
370
+ all_properties[entity_type][prop] ||= 0
371
+ all_properties[entity_type][prop] += 1
372
+ }
373
+
374
+ }
375
+
376
+ update_global_statistic(all_properties)
377
+ }
378
+ end
379
+
380
+ ###
381
+ # The method loads all defined entity class types from http://mappings.dbpedia.org/server/ontology/classes/
382
+ #
383
+ # @param [String] path
384
+ #
385
+ # @return [Hash] classes
386
+ def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
387
+ data = ensure_load_json(path, {})
388
+ HashHelper.recursive_map_keys(data)
389
+ end
390
+
391
+
392
+ private
393
+
394
+ ###
395
+ # The method helps to continue the process of finding links in the nif dataset.
396
+ #
397
+ # @param [String] resource_uri
398
+ # @param [Hash] actual_resource_data Part data extracted from nif dataset for given resource_uri
399
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
400
+ #
401
+ # @return [Hash] out
402
+ def find_relations(resource_uri, actual_resource_data, type)
403
+ out = {
404
+ sections: {},
405
+ relations: []
406
+ }
407
+
408
+ puts "- properties to find size[#{actual_resource_data.size}]" if @console_output
409
+
410
+ time = Benchmark.realtime {
411
+ out[:relations] = actual_resource_data.map! { |resource_data|
412
+ section_group = resource_data[:section].scan(SCAN_REGEXP[:group])
413
+
414
+ type_key = resource_data[:section].force_encoding('utf-8')
415
+
416
+ out[:sections][type_key] ||= {
417
+ type: section_group[0][0],
418
+ from: section_group[0][1].to_i,
419
+ to: section_group[0][2].to_i,
420
+ }
421
+
422
+ result = get_predicates_by_link(resource_uri, resource_data[:link], type)
423
+
424
+ resource_data[:properties] = result[:properties]
425
+ resource_data[:strict_properties] = result[:strict_properties]
426
+
427
+ resource_data
428
+ }.compact || []
429
+ }
430
+
431
+ out[:time] = time.round(2)
432
+
433
+ puts "- properties found in #{out[:time]}" if @console_output
434
+
435
+ out
436
+ end
437
+
438
+
439
+ ###
440
+ # The method generates a file path for the given resource URI.
441
+ # It also ensures that a subdirectory exists for the resource entity type.
442
+ #
443
+ # @param [String] resource_uri
444
+ # @param [String] type
445
+ #
446
+ # @return [String] resource_file_path
447
+ def get_resource_file_path(resource_uri, type)
448
+ type = type.split('/').last
449
+ resource_name = resource_uri.split('/').last
450
+
451
+ dir_path = "#{@results_dir_path}/#{type}"
452
+ Dir.mkdir(dir_path) unless Dir.exist?(dir_path)
453
+
454
+ "#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json"
455
+ end
456
+
457
+ ###
458
+ # The method helps to update the stored links with the found predicates.
459
+ #
460
+ # @param [String] resource_uri
461
+ # @param [String] type
462
+ #
463
+ # @return [Array<Hash>] old_data, new_data
464
+ def update_nif_file_properties(resource_uri, type)
465
+ if block_given?
466
+ path = get_resource_file_path(resource_uri, type)
467
+ old_data = ensure_load_json(path, {}, symbolize_names: true)
468
+
469
+ new_data = old_data.dup
470
+
471
+ time = Benchmark.realtime {
472
+ new_data[:nif_data] = old_data[:nif_data].map { |hash|
473
+ actual_link = hash[:link].to_sym
474
+
475
+ result = yield actual_link
476
+
477
+ hash[:strict_properties] = result[:strict_properties] if result[:strict_properties]
478
+ hash[:properties] = result[:properties] if result[:properties]
479
+
480
+ hash
481
+ }
482
+ }
483
+
484
+ new_data[:process_time][:relations_find] = time.round(2)
485
+
486
+ File.write(path, JSON.pretty_generate(new_data))
487
+ return old_data, new_data
488
+ end
489
+ end
490
+
491
+ ###
492
+ # The method yields the found hash for the required property.
493
+ # This hash contains a counter, a score and also all identical properties.
494
+ # At the end it updates the score with the value returned from the yield block.
495
+ #
496
+ # @param [String] property
497
+ # @param [Array<Hash>] this_knowledge_data
498
+ #
499
+ # @yieldparam found
500
+ # @yieldreturn score
501
+ def prepare_property_to_knowledge(property, this_knowledge_data)
502
+ property = property.to_s
503
+
504
+ this_knowledge_data ||= []
505
+ found = this_knowledge_data.find { |data| data[:predicates].include?(property) }
506
+
507
+ if found.nil? || found.empty?
508
+ # add new
509
+
510
+ identical_properties = @predicates_similarity.find_identical(property)
511
+
512
+ found = {
513
+ counter: 0,
514
+ score: 0.0,
515
+ predicates: identical_properties || [property.to_s]
516
+ }
517
+
518
+ this_knowledge_data << found
519
+ end
520
+
521
+ new_score = yield found
522
+
523
+
524
+ found[:score] = new_score
525
+ end
526
+
527
+ ###
528
+ # The method deletes all resources that already have a created result file
529
+ #
530
+ # @param [Hash{resource=>type}] resources
531
+ def keep_unloaded(resources)
532
+ resources.delete_if { |resource, values|
533
+ dir_path = "#{@results_dir_path}/#{values[:type]}"
534
+ resource_name = resource.split('/').last
535
+ File.exists?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
536
+ }
537
+ end
538
+
539
+ ###
540
+ # The method keeps all resources that already have a created result file
541
+ #
542
+ # @param [Hash{resource=>type}] resources
543
+ def keep_loaded(resources)
544
+ resources.keep_if { |resource, values|
545
+ dir_path = "#{@results_dir_path}/#{values[:type]}"
546
+ resource_name = resource.split('/').last
547
+ File.exists?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
548
+ }
549
+ end
550
+
551
+ ###
552
+ # The method allows updating the knowledge base for every entity class type.
553
+ #
554
+ # @param [Hash] new_data
555
+ def update_knowledge_base(new_data)
556
+ path = "#{@results_dir_path}/knowledge_base.json"
557
+ old_data = ensure_load_json(path, {}, symbolize_names: true)
558
+ File.write(path, JSON.pretty_generate(old_data.merge(new_data)))
559
+ end
560
+
561
+ ###
562
+ # The method allows updating the global statistic for every entity class type.
563
+ #
564
+ # @param [Hash] new_data
565
+ def update_global_statistic(new_data)
566
+ path = "#{@results_dir_path}/global_statistic.json"
567
+ old_data = ensure_load_json(path, {}, symbolize_names: true)
568
+ File.write(path, JSON.pretty_generate(old_data.merge(new_data)))
569
+ end
570
+
571
+ ###
572
+ # The method returns global properties for given entity class type.
573
+ #
574
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
575
+ #
576
+ # @return [Hash] global_statistic_by_type
577
+ def get_global_statistic_by_type(type)
578
+ type = type.to_s.to_sym
579
+ path = "#{@results_dir_path}/global_statistic.json"
580
+ data = ensure_load_json(path, {}, symbolize_names: true)
581
+ data[type]
582
+ end
583
+
584
+ ###
585
+ # The method helps to load json file.
586
+ #
587
+ # @param [String] file_path
588
+ # @param [Object] def_val Default value used when the file does not exist.
589
+ # @param [Hash] json_params JSON.parse params
590
+ #
591
+ # @return [Object] json
592
+ def ensure_load_json(file_path, def_val, json_params = {})
593
+ if File.exists?(file_path)
594
+ file_data = File.read(file_path).force_encoding('utf-8')
595
+ if file_data.size >= 2 # '[]'
596
+ JSON.parse(file_data, json_params)
597
+ else
598
+ def_val
599
+ end
600
+ else
601
+ def_val
602
+ end
603
+ end
604
+
605
+
606
+ end
607
+ end
608
+
609
+ end
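An end-to-end usage sketch for the Statistic class; the dataset path, entity type and thresholds are placeholders, and the SPARQL endpoint is whatever SPARQLRequest is configured to use:

    require 'browser_web_data_entity_sumarization'

    statistic = BrowserWebData::EntitySumarization::Statistic.new(
      '/data/nif_abstract_context_en.ttl',  # hypothetical NIF abstract dataset
      '/data/results',                      # must already exist
      true                                  # console output
    )

    statistic.create_by_nif_dataset(
      entity_types: ['http://dbpedia.org/ontology/Film'],
      entity_count: 10,                     # best ranked resources per type
      best_score_count: 20,                 # predicates kept per type in knowledge_base.json
      demand_reload: false,
      identity_identical_predicates: true
    )

For orientation, the per-link weight computed in generate_result_file behaves roughly like this (invented numbers): a link at character offset 100 of an abstract whose last section ends at offset 1000 gives position_weight = 1 - 100/1000.0 = 0.9; if its section sits at index 3 out of 5 sections, degradation = 1 - (3/5.0)/10 = 0.94, so the stored weight is (0.9 * 0.94).round(4) = 0.846.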