browser_web_data_entity_sumarization 1.0.0beta1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b24bd4b2cd4ae8c908f6549710531303a49c36e5
4
+ data.tar.gz: 0c48cc2e23703a28dbe262c3535703b87698f63f
5
+ SHA512:
6
+ metadata.gz: 786bfb65d8250af636e9a1187cc7530738e734252d405c53bc10c0f0c03ba0ea4953fca5b32613817c50c4c67200df09ea31ff96f0a2ec8f0c481b53a965e00b
7
+ data.tar.gz: 3448f20b4aa7b7bd457da33479435fd557ef14c87f13264cf3a99286ecd053511cbf341f0ae2f1cb75a516fe5bf4d9e73bd6a0e46b7de701d93d1938f6851c12
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+ require 'sparql/client'
3
+ require 'benchmark'
4
+ require 'json'
5
+
6
+ module BrowserWebData
7
+ module EntitySumarization
8
+
9
+ end
10
+ end
11
+
12
+ Dir.glob(File.dirname(__FILE__) + '/utils/*.rb').each { |file| require file }
13
+ Dir.glob(File.dirname(__FILE__) + '/config/*.rb').each { |file| require file }
14
+
15
+ # Require all gem scripts by their relative names
16
+ Dir[File.dirname(__FILE__) + '/browser_web_data_entity_sumarization/**/*.rb'].each do |file|
17
+ require(file.gsub('\\', '/').split('/lib/').last[0..-4])
18
+ end
19
+
20
+
21
+
22
+
23
+
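For illustration only (not part of the gem): the relative-require expression above normalizes path separators, keeps the part after '/lib/' and drops the '.rb' extension to obtain a require name. A minimal sketch with an assumed file path:

# Hypothetical example of the require-name transformation used above.
file = 'C:\\project\\lib\\browser_web_data_entity_sumarization\\utils\\string_helper.rb'
puts file.gsub('\\', '/').split('/lib/').last[0..-4]
# prints: browser_web_data_entity_sumarization/utils/string_helper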
@@ -0,0 +1,59 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+ ###
12
+ # The class includes helpers to retrieve structured NIF data from NIF lines.
13
+ class NIFLineParser
14
+ include BrowserWebData::EntitySumarizationConfig
15
+
16
+ ###
17
+ # The method applies a scan to recognize the resource URI in a given NIF dataset line.
18
+ #
19
+ # @param [String] line
20
+ #
21
+ # @return [String] resource_uri
22
+ # @example resource_uri: "http://dbpedia.org/resource/Captain_EO"
23
+ def self.parse_resource_uri(line)
24
+ (line.scan(SCAN_REGEXP[:scan_resource])[0])[0].split('?').first
25
+ end
26
+
27
+ ###
28
+ # The method applies scans to recognize the link, anchor, indexes and section from a given group of 7 NIF dataset lines.
29
+ #
30
+ # @param [Array<String>] lines_group
31
+ #
32
+ # @return [Hash] nif_data
33
+ # @example nif_data:
34
+ # {
35
+ # link: "http://dbpedia.org/resource/Science_fiction_film",
36
+ # anchor: "science fiction film",
37
+ # indexes: ["33", "53"],
38
+ # section: "paragraph_0_419"
39
+ # }
40
+ def self.parse_line_group(lines_group)
41
+ begin_index = lines_group[2].scan(SCAN_REGEXP[:begin_index])[0]
42
+ end_index = lines_group[3].scan(SCAN_REGEXP[:end_index])[0]
43
+ target_resource_link = lines_group[5].scan(SCAN_REGEXP[:target_resource_link])[0]
44
+ section = lines_group[4].scan(SCAN_REGEXP[:section])[0]
45
+ anchor = lines_group[6].scan(SCAN_REGEXP[:anchor])[0]
46
+
47
+ {
48
+ link: target_resource_link[1].force_encoding('utf-8'),
49
+ anchor: anchor[1].force_encoding('utf-8'),
50
+ indexes: [begin_index[1], end_index[1]],
51
+ section: section[0].split('=')[1]
52
+ }
53
+ end
54
+
55
+ end
56
+
57
+ end
58
+
59
+ end
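A rough idea of what NIFLineParser extracts, shown with a simplified stand-alone regex rather than the gem's SCAN_REGEXP config; the NIF subject line below is an assumed example in the style of the DBpedia NIF abstract datasets. parse_line_group similarly returns a Hash with :link, :anchor, :indexes and :section, as in the @example above.

# Simplified sketch, not the gem's implementation: pull the resource URI out of a
# NIF context subject line like the ones NIFLineParser.parse_resource_uri consumes.
line = '<http://dbpedia.org/resource/Captain_EO?dbpv=2016-10&nif=context> ' \
       '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Context> .'
resource_uri = line.scan(%r{<(http://dbpedia\.org/resource/[^>]+)>})[0][0].split('?').first
puts resource_uri
# prints: http://dbpedia.org/resource/Captain_EO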
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+
12
+ ###
13
+ # The class includes helper methods.
14
+ # (todo definition of predicate instance)
15
+ class Predicate
16
+ include BrowserWebData::EntitySumarizationConfig
17
+
18
+ ###
19
+ # The method helps identify unimportant predicates by constants.
20
+ #
21
+ # @param [String] property
22
+ #
23
+ # @return [TrueClass, FalseClass] result
24
+ def self.unimportant?(property)
25
+ property = property.to_s
26
+ NO_SENSE_PROPERTIES.include?(property) || COMMON_PROPERTIES.include?(property)
27
+ end
28
+
29
+ end
30
+
31
+ end
32
+
33
+ end
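A minimal sketch of the filtering idea behind Predicate.unimportant?; the real constants live in BrowserWebData::EntitySumarizationConfig and the property lists below are assumed stand-in values:

# Stand-in constants; the gem's actual NO_SENSE_PROPERTIES / COMMON_PROPERTIES may differ.
NO_SENSE_PROPERTIES = %w[http://dbpedia.org/ontology/wikiPageID]
COMMON_PROPERTIES   = %w[http://www.w3.org/2000/01/rdf-schema#label]

def unimportant?(property)
  property = property.to_s
  NO_SENSE_PROPERTIES.include?(property) || COMMON_PROPERTIES.include?(property)
end

puts unimportant?('http://www.w3.org/2000/01/rdf-schema#label')  # true
puts unimportant?('http://dbpedia.org/ontology/starring')        # false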
@@ -0,0 +1,263 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+ ###
12
+ # The class includes methods to identify identical predicates.
13
+ class PredicatesSimilarity
14
+ include BrowserWebData::EntitySumarizationConfig
15
+
16
+ def initialize(results_dir_path, identical_limit = IDENTICAL_PROPERTY_LIMIT, console_output = false)
17
+ @results_dir_path = results_dir_path
18
+ @console_output = console_output
19
+ @identical_limit = identical_limit
20
+
21
+ @query = SPARQLRequest.new
22
+
23
+ load_identical_predicates
24
+ load_different_predicates
25
+ end
26
+
27
+ ###
28
+ # The method returns the key for a group of identical predicates.
29
+ #
30
+ # @param [Array<String>] predicates
31
+ #
32
+ # @return [String] key
33
+ def self.get_key(predicates)
34
+ predicates = [predicates] unless predicates.is_a?(Array)
35
+ "<#{predicates.join('><')}>" if predicates && !predicates.empty?
36
+ end
37
+
38
+ ###
39
+ # The method returns the identical predicates parsed from a key.
40
+ #
41
+ # @param [String] key
42
+ #
43
+ # @return [Array<String>] predicates
44
+ def self.parse_key(key)
45
+ key.to_s.scan(SCAN_REGEXP[:identical_key]).reduce(:+)
46
+ end
47
+
48
+ ###
49
+ # The method verifies every combination of two predicates.
50
+ # It stores each identified combination in two files, identical_predicates.json and different_predicates.json.
51
+ # Both files contain an Array of combination keys.
52
+ #
53
+ # @param [Array<String>] predicates
54
+ def identify_identical_predicates(predicates, identical_limit = @identical_limit)
55
+ @temp_counts ||= {}
56
+
57
+ predicates.combination(2).each { |values|
58
+
59
+ already_mark_same = find_identical(values)
60
+ already_mark_different = find_different(values)
61
+
62
+ if already_mark_same.nil? && already_mark_different.nil?
63
+
64
+ # in case of dbpedia ontology vs. property
65
+ # they automatically become identical
66
+ unless try_auto_identical(values)
67
+
68
+ unless @temp_counts[values[0]]
69
+ @temp_counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
70
+ end
71
+
72
+ unless @temp_counts[values[1]]
73
+ @temp_counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
74
+ end
75
+
76
+ x = @temp_counts[values[0]]
77
+ y = @temp_counts[values[1]]
78
+ z = @query.get_count_of_identical_predicates(values)
79
+
80
+ identical_level = z.to_f / [x, y].max # float division so the ratio is not truncated
81
+
82
+ if identical_level >= identical_limit
83
+ puts " - result[#{identical_level}] z[#{z}] x[#{x}] y[#{y}] #{values.inspect}" if @console_output
84
+ add_identical(values)
85
+ else
86
+ add_different(values)
87
+ end
88
+ end
89
+
90
+ end
91
+
92
+ true
93
+ }
94
+
95
+ end
96
+
97
+ ###
98
+ # The method helps to recognize whether the given predicates are already marked as identical.
99
+ #
100
+ # @param [Array<String>, String] value
101
+ #
102
+ # @return [String, NilClass]
103
+ def find_identical(value)
104
+ raise RuntimeError.new('Identifying identical predicates is not supported for more than 2 predicates.') if value.is_a?(Array) && value.size > 2
105
+
106
+ predicates_key = case value
107
+ when Array
108
+ value = value.map { |v| PredicatesSimilarity.get_key(v) }
109
+ @identical_predicates.find { |p|
110
+ p[value[0]] && p[value[1]]
111
+ }
112
+ else
113
+ value = PredicatesSimilarity.get_key(value)
114
+ @identical_predicates.find { |p|
115
+ p[value]
116
+ }
117
+ end
118
+
119
+ PredicatesSimilarity.parse_key(predicates_key)
120
+ end
121
+
122
+ ###
123
+ # The method helps to recognize whether the given predicates are already marked as different.
124
+ #
125
+ # @param [Array<String>, String] value
126
+ #
127
+ # @return [String, NilClass]
128
+ def find_different(value)
129
+ raise RuntimeError.new('Identifying identical predicates is not supported for more than 2 predicates.') if value.is_a?(Array) && value.size > 2
130
+
131
+ key = case value
132
+ when Array
133
+ value = value.map { |v| PredicatesSimilarity.get_key(v) }
134
+ @different_predicates.find { |p| p[value[0]] && p[value[1]] }
135
+ else
136
+ value = PredicatesSimilarity.get_key(value)
137
+ @different_predicates.find { |p| p[value] }
138
+ end
139
+
140
+ PredicatesSimilarity.parse_key(key)
141
+ end
142
+
143
+ def add_identical(values)
144
+ values = values.map { |p| p.to_s }.uniq.sort
145
+ group_key = PredicatesSimilarity.get_key(values)
146
+
147
+ unless @identical_predicates.include?(group_key)
148
+ @identical_predicates << group_key
149
+ store_identical_properties
150
+ end
151
+ end
152
+
153
+ def add_different(values)
154
+ values = values.map { |p| p.to_s }.uniq.sort
155
+ group_key = PredicatesSimilarity.get_key(values)
156
+
157
+ unless @different_predicates.include?(group_key)
158
+ @different_predicates << group_key
159
+
160
+ @new_diff_counter ||= 0
161
+ @new_diff_counter += 1
162
+
163
+ if @new_diff_counter > 100
164
+ store_different_predicates
165
+ @new_diff_counter = 0
166
+ end
167
+
168
+ end
169
+ end
170
+
171
+ def try_auto_identical(values)
172
+ group_key = PredicatesSimilarity.get_key(values)
173
+
174
+ temp = values.map { |val| val.to_s.split('/').last }.uniq
175
+ if temp.size == 1 && group_key['property/'] && group_key['ontology/']
176
+ add_identical(values)
177
+ true
178
+ else
179
+ false
180
+ end
181
+ end
182
+
183
+
184
+ ###
185
+ # The method helps to reduce identical predicates by joining groups that share a common predicate.
186
+ def reduce_identical
187
+ new_identical = []
188
+
189
+ @identical_predicates.each { |key|
190
+ values = PredicatesSimilarity.parse_key(key)
191
+ next if new_identical.find { |v| !(v & values).empty? }
192
+
193
+ ## find all groups that share a predicate with values
194
+ values = recursive_find_identical(key, values)
195
+
196
+ new_identical << values.uniq.sort
197
+ }
198
+
199
+ @identical_predicates = new_identical.map { |v| PredicatesSimilarity.get_key(v) }
200
+
201
+ store_identical_properties
202
+ end
203
+
204
+ def recursive_find_identical(keys, values)
205
+ keys = [keys] unless keys.is_a?(Array)
206
+
207
+ @identical_predicates.each { |this_key|
208
+ next if keys.include?(this_key)
209
+ temp = PredicatesSimilarity.parse_key(this_key)
210
+
211
+ unless (temp & values).empty?
212
+ keys << this_key
213
+ return recursive_find_identical(keys, (values + temp).uniq)
214
+ end
215
+ }
216
+
217
+ values
218
+ end
219
+
220
+
221
+ private
222
+
223
+
224
+ def load_identical_predicates
225
+ unless @identical_predicates
226
+ file_path = "#{@results_dir_path}/identical_predicates.json"
227
+ @identical_predicates = ensure_load_json(file_path, [])
228
+ end
229
+ end
230
+
231
+ def load_different_predicates
232
+ unless @different_predicates
233
+ file_path = "#{@results_dir_path}/different_predicates.json"
234
+ @different_predicates = ensure_load_json(file_path, [])
235
+ end
236
+ end
237
+
238
+ def store_identical_properties
239
+ File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
240
+ end
241
+
242
+ def store_different_predicates
243
+ File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
244
+ end
245
+
246
+ def ensure_load_json(file_path, def_val, json_params = {})
247
+ if File.exists?(file_path)
248
+ file_data = File.read(file_path).force_encoding('utf-8')
249
+ if file_data.size >= 2 # '[]'
250
+ JSON.parse(file_data, json_params)
251
+ else
252
+ def_val
253
+ end
254
+ else
255
+ def_val
256
+ end
257
+ end
258
+
259
+ end
260
+
261
+ end
262
+
263
+ end
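The similarity test above reduces to comparing counts: if z is the number of subjects that use both predicates and x, y are the counts for each predicate alone, the pair is marked identical when z / max(x, y) reaches the limit. A self-contained sketch of that decision follows; the counts and the 0.9 limit are made-up values, whereas in the gem they come from SPARQLRequest queries and IDENTICAL_PROPERTY_LIMIT. Pairs are stored under keys of the form "<predicate1><predicate2>" produced by get_key and split back by parse_key.

# Assumed example values; the gem obtains these counts via SPARQL queries.
x = 1200.0   # subjects using the first predicate
y = 1100.0   # subjects using the second predicate
z = 1050.0   # subjects using both predicates
identical_limit = 0.9

identical_level = z / [x, y].max
puts identical_level.round(3)            # 0.875
puts identical_level >= identical_limit  # false -> the pair is stored as different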
@@ -0,0 +1,609 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # Core project module
5
+ module BrowserWebData
6
+
7
+ ###
8
+ # Project logic module
9
+ module EntitySumarization
10
+
11
+ ###
12
+ # The Statistic class allows finding, collecting and generating knowledge for entity summarization.
13
+ # Entity summarization is based on datasets in the NLP Interchange Format (NIF),
14
+ # for example the datasets from http://wiki.dbpedia.org/nif-abstract-datasets
15
+ # The knowledge is generated from information in DBpedia.
16
+ class Statistic
17
+ include BrowserWebData::EntitySumarizationConfig
18
+
19
+ attr_reader :nif_file_path, :results_dir_path
20
+
21
+ ###
22
+ # Create new instance.
23
+ #
24
+ # @param [String] nif_dataset_path
25
+ # @param [String] results_dir_path
26
+ # @param [TrueClass, FalseClass] console_output Allows printing info to the console. Default is false.
27
+ def initialize(nif_dataset_path, results_dir_path = File.join(__dir__, '../../results'), console_output = false)
28
+ nif_dataset_path = nif_dataset_path.gsub('\\', '/')
29
+ results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
30
+
31
+ raise ArgumentError, "NIF dataset not found: #{nif_dataset_path}" unless File.exists?(nif_dataset_path)
32
+ raise ArgumentError, "Results directory not found: #{results_dir_path}" unless File.exists?(results_dir_path)
33
+
34
+ @nif_file_path = nif_dataset_path.gsub('\\', '/')
35
+ @results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
36
+ @console_output = console_output
37
+
38
+ @query = SPARQLRequest.new
39
+ @predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
40
+ end
41
+
42
+ ###
43
+ # The method finds resource links in the given NIF dataset file.
44
+ #
45
+ # @param [Hash] params
46
+ # @option params [Array<String>, String] :entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
47
+ # @option params [Fixnum] :entity_count Number of best ranked resources to take for every entity type.
48
+ # @option params [Fixnum] :best_score_count Count of result predicates to keep.
49
+ # @option params [FalseClass, TrueClass] :demand_reload
50
+ # @option params [FalseClass, TrueClass] :identity_identical_predicates
51
+ def create_by_nif_dataset(params)
52
+ params[:entity_types] = [params[:entity_types]] unless params[:entity_types].is_a?(Array)
53
+
54
+ generate_statistics_from_nif(params[:entity_types], params[:entity_count], params[:demand_reload])
55
+
56
+ params[:entity_types].each { |type|
57
+ generate_knowledge_base(type, params[:best_score_count], params[:identity_identical_predicates])
58
+ }
59
+ end
60
+
61
+ ###
62
+ # The method returns a list of the best ranked resources for the required entity types.
63
+ #
64
+ # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
65
+ # @param [Fixnum] count Count of best ranked resources
66
+ #
67
+ # @return [Hash] resources
68
+ def get_best_ranked_resources(entity_types, count = 10)
69
+ resources = {}
70
+ entity_types = [entity_types] unless entity_types.is_a?(Array)
71
+
72
+ entity_types.each { |type|
73
+ top_ranked_entities = @query.get_resources_by_dbpedia_page_rank(type, count)
74
+
75
+ top_ranked_entities.each { |solution|
76
+ resources[solution.entity.value] = {type: type, rank: solution.rank.value.to_f}
77
+ }
78
+ }
79
+
80
+ resources
81
+ end
82
+
83
+ ###
84
+ # The method finds links in the given NIF dataset and then collects relations via #find_relations.
85
+ # For each resource it generates a file in @results_dir_path.
86
+ #
87
+ # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
88
+ # @param [Fixnum] count Count of best ranked resources
89
+ # @param [FalseClass, TrueClass] demand_reload
90
+ def generate_statistics_from_nif(entity_types, count = 10, demand_reload = false)
91
+ resources = get_best_ranked_resources(entity_types, count)
92
+
93
+ resources = keep_unloaded(resources) unless demand_reload
94
+
95
+ actual_resource_data = []
96
+ lines_group = []
97
+
98
+ begin
99
+ time_start = Time.now
100
+ nif_file = File.open(@nif_file_path, 'r')
101
+ line = nif_file.readline
102
+
103
+ until nif_file.eof?
104
+ line = nif_file.readline
105
+
106
+ if lines_group.size == 7
107
+ # evaluate group (7 lines)
108
+ this_resource_uri = NIFLineParser.parse_resource_uri(lines_group[0])
109
+
110
+ if resources.keys.include?(this_resource_uri)
111
+ # the group belongs to a requested resource, process it
112
+ resource_uri = this_resource_uri
113
+ actual_resource_data << NIFLineParser.parse_line_group(lines_group)
114
+
115
+ elsif !actual_resource_data.empty?
116
+ # resource changed, process actual_resource_data
117
+ resource_hash = resources.delete(resource_uri)
118
+ type = resource_hash[:type]
119
+
120
+ this_time = (Time.now - time_start).round(2)
121
+ puts "\n#{resource_uri}\n- nif found in #{this_time}\n- resources to find #{resources.size}" if @console_output
122
+
123
+ result_relations = find_relations(resource_uri, actual_resource_data, type)
124
+ generate_result_file(resource_uri, type, result_relations, this_time)
125
+
126
+ break if resources.empty?
127
+
128
+ actual_resource_data = []
129
+ time_start = Time.now
130
+ end
131
+
132
+ # start new group
133
+ lines_group = [line]
134
+ else
135
+
136
+ # join line to group
137
+ lines_group << line
138
+ end
139
+
140
+ end
141
+
142
+ ensure
143
+ nif_file.close if nif_file && !nif_file.closed?
144
+ end
145
+ end
146
+
147
+ ###
148
+ # The method helps to recollect relations from already generated result files.
149
+ #
150
+ # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
151
+ # @param [Fixnum] count Count of best ranked resources
152
+ def refresh_statistics_in_files(entity_types, count = 10)
153
+ resources = get_best_ranked_resources(entity_types, count)
154
+
155
+ resources = keep_loaded(resources)
156
+
157
+ resources.each { |resource_uri, resource_info|
158
+ puts "_____ #{resource_uri} _____" if @console_output
159
+
160
+ update_nif_file_properties(resource_uri, resource_info[:type]) { |link|
161
+ get_predicates_by_link(resource_uri, link, resource_info[:type])
162
+ }
163
+ }
164
+
165
+ end
166
+
167
+
168
+ ###
169
+ # The method finds predicates for the given link.
170
+ # It finds strict predicates that are in the relation: <resource> ?predicate <link> .
171
+ # It also finds predicates that are in the relation: ?subject a <type> . ?subject ?predicate <link> .
172
+ #
173
+ # @param [String] resource_uri Resource for which strict properties will be found
174
+ # @param [String] link Link that has some importance to resource or entity type.
175
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
176
+ #
177
+ # @return [Hash] result
178
+ def get_predicates_by_link(resource_uri, link, type)
179
+ properties = {type => {}}
180
+ strict_properties = {type => {}}
181
+
182
+ @query.get_all_predicates_by_subject_object(resource_uri, link).each { |solution|
183
+ predicate = solution.to_h
184
+ property = predicate[:property].to_s.force_encoding('utf-8')
185
+
186
+ next if Predicate.unimportant?(property)
187
+
188
+ count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
189
+ strict_properties[type][property] = count if count > 0
190
+ }
191
+
192
+ @query.get_all_predicates_by_object(link).each { |solution|
193
+ predicate = solution.to_h
194
+ property = predicate[:property].to_s.force_encoding('utf-8')
195
+
196
+ next if Predicate.unimportant?(property) || strict_properties[type][property]
197
+
198
+ count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
199
+ properties[type][property] = count if count > 0
200
+ }
201
+
202
+
203
+ {properties: properties, strict_properties: strict_properties}
204
+ end
205
+
206
+ ###
207
+ # The method helps to store the information found in the NIF dataset for the given resource.
208
+ #
209
+ # @param [String] resource_uri
210
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
211
+ # @param [Hash] result_relations Hash generated by method #find_relations
212
+ # @option result_relations [Hash] :sections Contains key 'section_type' value 'position'
213
+ # @option result_relations [Array<Hash>] :relations Hashes generated by method #get_predicates_by_link
214
+ #
215
+ # @param [Float] this_time Relative time of find in nif dataset.
216
+ def generate_result_file(resource_uri, type, result_relations, this_time)
217
+ section_degradation = result_relations[:sections].map { |section_type, position|
218
+ index = result_relations[:sections].keys.index(section_type)
219
+
220
+ # recognize the degradation value from the relative position of the paragraph in the document
222
+ position[:degradation] = 1 - ((index.to_f / result_relations[:sections].size) / 10.0)
222
+
223
+ {section_type => position}
224
+ }.reduce(:merge)
225
+
226
+ total_size = section_degradation.max_by { |_, v| v[:to] }[1][:to].to_f
227
+
228
+ result_nif_data = result_relations[:relations].map { |relation|
229
+ paragraph_position = section_degradation[relation[:section]]
230
+
231
+ # weight decreases with relative distance from the document start
232
+ position_weight = (1 - ((relation[:indexes][0].to_i) / total_size))
233
+ # weight is also degraded by index of paragraph
234
+ relation[:weight] = (position_weight * paragraph_position[:degradation]).round(4)
235
+
236
+ relation
237
+ }
238
+
239
+ result = {
240
+ process_time: {nif_find: this_time, relations_find: result_relations[:time]},
241
+ resource_uri: resource_uri,
242
+ nif_data: result_nif_data
243
+ }
244
+
245
+ result_path = get_resource_file_path(resource_uri, type)
246
+ File.open(result_path, 'w:utf-8') { |f| f << JSON.pretty_generate(result) }
247
+ end
248
+
249
+ ###
250
+ # The method processes all result files generated from the NIF dataset (by entity class type)
252
+ # into one knowledge base result file.
252
+ #
253
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
254
+ # @param [Fixnum] best_count Defines the max count of properties that will be assigned to the entity class type.
255
+ # @param [TrueClass, FalseClass] identify_identical Flag to identify and group identical properties as one item.
256
+ def generate_knowledge_base(type, best_count = 20, identify_identical = true)
257
+ puts "_____ #{type} _____" if @console_output
258
+ files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
259
+ type = type.to_s.to_sym
260
+
261
+ knowledge_data = {type => []}
262
+
263
+ files.each { |file_path|
264
+ puts "- calculate #{file_path}" if @console_output
265
+ file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
266
+
267
+ if identify_identical
268
+ file_data[:nif_data].each { |data|
269
+ all_properties = (data[:properties][type].keys + ((data[:strict_properties] || {})[type] || {}).keys).uniq
270
+ @predicates_similarity.identify_identical_predicates(all_properties)
271
+ }
272
+ end
273
+
274
+ file_data[:nif_data].each { |found|
275
+
276
+ properties = found[:properties][type.to_sym]
277
+ strict_properties = (found[:strict_properties] ||{})[type] || {}
278
+ weight = found[:weight]
279
+
280
+ strict_properties.each { |property, count|
281
+ property = property.to_s
282
+ value = count.to_i * weight
283
+
284
+ prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
285
+ old_score = from_knowledge[:score] * from_knowledge[:counter]
286
+ from_knowledge[:counter] += 1
287
+ (old_score + value) / from_knowledge[:counter]
288
+ }
289
+ }
290
+
291
+ properties.each { |property, count|
292
+ property = property.to_s
293
+ value = count.to_i * weight
294
+
295
+ prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
296
+ old_score = from_knowledge[:score] * from_knowledge[:counter]
297
+ from_knowledge[:counter] += 1
298
+ (old_score + value) / from_knowledge[:counter]
299
+ }
300
+ }
301
+ }
302
+
303
+ unless knowledge_data[type].empty?
304
+ max_weight = knowledge_data[type].max_by { |data| data[:score] }[:score]
305
+ knowledge_data[type] = knowledge_data[type].map { |hash|
306
+ hash[:score] = (hash[:score] / max_weight).round(4)
307
+ hash
308
+ }
309
+ end
310
+ }
311
+
312
+ global_properties = get_global_statistic_by_type(type) || {}
313
+ if identify_identical
314
+ @predicates_similarity.identify_identical_predicates(global_properties.keys)
315
+ end
316
+
317
+ if global_properties.size > 0
318
+ max_count = global_properties.max_by { |_, count| count }[1].to_f
319
+ global_properties.each { |property, count|
320
+
321
+ value = count / max_count
322
+
323
+ prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
324
+ from_knowledge[:score] > 0 ? ((from_knowledge[:score] + value) / 2.0).round(4) : value.round(4)
325
+ }
326
+ }
327
+ end
328
+
329
+ knowledge_data[type].map! { |hash|
330
+ hash.delete(:counter)
331
+ hash
332
+ }
333
+
334
+ knowledge_data[type] = knowledge_data[type].sort_by { |hash| hash[:score] }.reverse.take(best_count)
335
+
336
+ if identify_identical
337
+ @predicates_similarity.reduce_identical
338
+ end
339
+
340
+ update_knowledge_base(knowledge_data)
341
+ end
342
+
343
+ ###
344
+ # The method generate simple statistics that contain all predicates that links to literal.
345
+ # Predicates are grouped by entity class type and also contains count of total occurrence.
346
+ # Predicates find from best ranked resources.
347
+ #
348
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
349
+ # @param [Fixnum] count Count of best ranked resources
350
+ def generate_literal_statistics(type = nil, count = 10)
351
+ unless type
352
+ type = get_all_classes
353
+ end
354
+
355
+ type = [type] unless type.is_a?(Array)
356
+
357
+ type.each_with_index { |entity_type, index|
358
+ all_properties = {}
359
+ puts "#{__method__} - start process entity type: #{entity_type} [#{(index / type.size.to_f).round(2)}]" if @console_output
360
+ entity_type = entity_type.to_s.to_sym
361
+
362
+ get_best_ranked_resources(entity_type, count).each { |resource, _|
363
+ properties = @query.get_all_predicates_by_subject(resource.to_s, true).map { |solution_prop|
364
+ solution_prop[:property].to_s
365
+ } || []
366
+
367
+ properties.uniq.each { |prop|
368
+ next if Predicate.unimportant?(prop)
369
+ all_properties[entity_type] ||= {}
370
+ all_properties[entity_type][prop] ||= 0
371
+ all_properties[entity_type][prop] += 1
372
+ }
373
+
374
+ }
375
+
376
+ update_global_statistic(all_properties)
377
+ }
378
+ end
379
+
380
+ ###
381
+ # The method loads all entity class types defined by http://mappings.dbpedia.org/server/ontology/classes/
382
+ #
383
+ # @param [String] path
384
+ #
385
+ # @return [Hash] classes
386
+ def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
387
+ data = ensure_load_json(path, {})
388
+ HashHelper.recursive_map_keys(data)
389
+ end
390
+
391
+
392
+ private
393
+
394
+ ###
395
+ # The method helps to continue the process of finding links in the NIF dataset.
396
+ #
397
+ # @param [String] resource_uri
398
+ # @param [Array<Hash>] actual_resource_data Partial data extracted from the NIF dataset for the given resource_uri
399
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
400
+ #
401
+ # @return [Hash] out
402
+ def find_relations(resource_uri, actual_resource_data, type)
403
+ out = {
404
+ sections: {},
405
+ relations: []
406
+ }
407
+
408
+ puts "- properties to find size[#{actual_resource_data.size}]" if @console_output
409
+
410
+ time = Benchmark.realtime {
411
+ out[:relations] = actual_resource_data.map! { |resource_data|
412
+ section_group = resource_data[:section].scan(SCAN_REGEXP[:group])
413
+
414
+ type_key = resource_data[:section].force_encoding('utf-8')
415
+
416
+ out[:sections][type_key] ||= {
417
+ type: section_group[0][0],
418
+ from: section_group[0][1].to_i,
419
+ to: section_group[0][2].to_i,
420
+ }
421
+
422
+ result = get_predicates_by_link(resource_uri, resource_data[:link], type)
423
+
424
+ resource_data[:properties] = result[:properties]
425
+ resource_data[:strict_properties] = result[:strict_properties]
426
+
427
+ resource_data
428
+ }.compact || []
429
+ }
430
+
431
+ out[:time] = time.round(2)
432
+
433
+ puts "- properties found in #{out[:time]}" if @console_output
434
+
435
+ out
436
+ end
437
+
438
+
439
+ ###
440
+ # The method generates a file path for the given resource URI.
441
+ # It also ensures that a subdirectory for the resource entity type exists.
442
+ #
443
+ # @param [String] resource_uri
444
+ # @param [String] type
445
+ #
446
+ # @return [String] resource_file_path
447
+ def get_resource_file_path(resource_uri, type)
448
+ type = type.split('/').last
449
+ resource_name = resource_uri.split('/').last
450
+
451
+ dir_path = "#{@results_dir_path}/#{type}"
452
+ Dir.mkdir(dir_path) unless Dir.exist?(dir_path)
453
+
454
+ "#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json"
455
+ end
456
+
457
+ ###
458
+ # The method helps to update the found predicates for stored links.
459
+ #
460
+ # @param [String] resource_uri
461
+ # @param [String] type
462
+ #
463
+ # @return [Array<Hash>] old_data, new_data
464
+ def update_nif_file_properties(resource_uri, type)
465
+ if block_given?
466
+ path = get_resource_file_path(resource_uri, type)
467
+ old_data = ensure_load_json(path, {}, symbolize_names: true)
468
+
469
+ new_data = old_data.dup
470
+
471
+ time = Benchmark.realtime {
472
+ new_data[:nif_data] = old_data[:nif_data].map { |hash|
473
+ actual_link = hash[:link].to_sym
474
+
475
+ result = yield actual_link
476
+
477
+ hash[:strict_properties] = result[:strict_properties] if result[:strict_properties]
478
+ hash[:properties] = result[:properties] if result[:properties]
479
+
480
+ hash
481
+ }
482
+ }
483
+
484
+ new_data[:process_time][:relations_find] = time.round(2)
485
+
486
+ File.write(path, JSON.pretty_generate(new_data))
487
+ return old_data, new_data
488
+ end
489
+ end
490
+
491
+ ###
492
+ # The method yields the found hash for the required property.
493
+ # This hash contains a counter, a score and also all identical properties.
494
+ # At the end it updates the score with the value returned from the yield block.
495
+ #
496
+ # @param [String] property
497
+ # @param [Array<Hash>] this_knowledge_data
498
+ #
499
+ # @yieldparam found
500
+ # @yieldreturn score
501
+ def prepare_property_to_knowledge(property, this_knowledge_data)
502
+ property = property.to_s
503
+
504
+ this_knowledge_data ||= []
505
+ found = this_knowledge_data.find { |data| data[:predicates].include?(property) }
506
+
507
+ if found.nil? || found.empty?
508
+ # add new
509
+
510
+ identical_properties = @predicates_similarity.find_identical(property)
511
+
512
+ found = {
513
+ counter: 0,
514
+ score: 0.0,
515
+ predicates: identical_properties || [property.to_s]
516
+ }
517
+
518
+ this_knowledge_data << found
519
+ end
520
+
521
+ new_score = yield found
522
+
523
+
524
+ found[:score] = new_score
525
+ end
526
+
527
+ ###
528
+ # The method deletes all resources that already have a created result file
529
+ #
530
+ # @param [Hash{resource=>type}] resources
531
+ def keep_unloaded(resources)
532
+ resources.delete_if { |resource, values|
533
+ dir_path = "#{@results_dir_path}/#{values[:type]}"
534
+ resource_name = resource.split('/').last
535
+ File.exists?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
536
+ }
537
+ end
538
+
539
+ ###
540
+ # The method keeps all resources that already have a created result file
541
+ #
542
+ # @param [Hash{resource=>type}] resources
543
+ def keep_loaded(resources)
544
+ resources.keep_if { |resource, values|
545
+ dir_path = "#{@results_dir_path}/#{values[:type]}"
546
+ resource_name = resource.split('/').last
547
+ File.exists?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
548
+ }
549
+ end
550
+
551
+ ###
552
+ # The method allows updating the knowledge base for every entity class type.
553
+ #
554
+ # @param [Hash] new_data
555
+ def update_knowledge_base(new_data)
556
+ path = "#{@results_dir_path}/knowledge_base.json"
557
+ old_data = ensure_load_json(path, {}, symbolize_names: true)
558
+ File.write(path, JSON.pretty_generate(old_data.merge(new_data)))
559
+ end
560
+
561
+ ###
562
+ # The method allows updating the global statistic for every entity class type.
563
+ #
564
+ # @param [Hash] new_data
565
+ def update_global_statistic(new_data)
566
+ path = "#{@results_dir_path}/global_statistic.json"
567
+ old_data = ensure_load_json(path, {}, symbolize_names: true)
568
+ File.write(path, JSON.pretty_generate(old_data.merge(new_data)))
569
+ end
570
+
571
+ ###
572
+ # The method returns the global properties for the given entity class type.
573
+ #
574
+ # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
575
+ #
576
+ # @return [Hash] global_statistic_by_type
577
+ def get_global_statistic_by_type(type)
578
+ type = type.to_s.to_sym
579
+ path = "#{@results_dir_path}/global_statistic.json"
580
+ data = ensure_load_json(path, {}, symbolize_names: true)
581
+ data[type]
582
+ end
583
+
584
+ ###
585
+ # The method helps to load a JSON file.
586
+ #
587
+ # @param [String] file_path
588
+ # @param [Object] def_val Default value used when the file does not exist.
589
+ # @param [Hash] json_params JSON.parse params
590
+ #
591
+ # @return [Object] json
592
+ def ensure_load_json(file_path, def_val, json_params = {})
593
+ if File.exists?(file_path)
594
+ file_data = File.read(file_path).force_encoding('utf-8')
595
+ if file_data.size >= 2 # '[]'
596
+ JSON.parse(file_data, json_params)
597
+ else
598
+ def_val
599
+ end
600
+ else
601
+ def_val
602
+ end
603
+ end
604
+
605
+
606
+ end
607
+ end
608
+
609
+ end
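A hypothetical driver for the Statistic workflow, based on the documented constructor and create_by_nif_dataset options; the dataset path, results directory and entity type identifier below are placeholder assumptions, the exact type form expected by SPARQLRequest is not shown in this diff, and running it requires a local NIF abstracts dataset plus access to the DBpedia SPARQL endpoint.

# Hypothetical usage sketch (paths and counts are placeholders, not from the gem).
require 'browser_web_data_entity_sumarization'

statistic = BrowserWebData::EntitySumarization::Statistic.new(
  '/data/nif/nif-abstracts_en.ttl',     # NIF dataset path (must exist)
  '/data/entity_sumarization_results',  # results directory (must exist)
  true                                  # console output
)

statistic.create_by_nif_dataset(
  entity_types: ['Film'],               # entity class type from the DBpedia ontology (assumed form)
  entity_count: 10,                     # best ranked resources per type
  best_score_count: 20,                 # predicates kept per type in the knowledge base
  demand_reload: false,
  identity_identical_predicates: true
)
# Expected effect per the code above: per-resource JSON files plus knowledge_base.json
# written under the results directory.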