browser_web_data_entity_sumarization 1.0.0beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/browser_web_data_entity_sumarization.rb +23 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_nif_parser.rb +59 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicate.rb +33 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicates_similarity.rb +263 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_statistics.rb +609 -0
- data/lib/browser_web_data_entity_sumarization/sparql_request.rb +74 -0
- data/lib/browser_web_data_entity_sumarization/version.rb +3 -0
- data/lib/config/entity_sumarization_config.rb +48 -0
- data/lib/knowledge/classes_hierarchy.json +906 -0
- data/lib/knowledge/common_properties.json +23 -0
- data/lib/knowledge/entity_classes.json +1 -0
- data/lib/knowledge/knowledge_base.json +40642 -0
- data/lib/utils/cache_helper.rb +69 -0
- data/lib/utils/hash_helper.rb +79 -0
- data/lib/utils/sparql_queries.rb +126 -0
- data/lib/utils/string_helper.rb +31 -0
- metadata +74 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: b24bd4b2cd4ae8c908f6549710531303a49c36e5
+  data.tar.gz: 0c48cc2e23703a28dbe262c3535703b87698f63f
+SHA512:
+  metadata.gz: 786bfb65d8250af636e9a1187cc7530738e734252d405c53bc10c0f0c03ba0ea4953fca5b32613817c50c4c67200df09ea31ff96f0a2ec8f0c481b53a965e00b
+  data.tar.gz: 3448f20b4aa7b7bd457da33479435fd557ef14c87f13264cf3a99286ecd053511cbf341f0ae2f1cb75a516fe5bf4d9e73bd6a0e46b7de701d93d1938f6851c12
data/lib/browser_web_data_entity_sumarization.rb
ADDED
@@ -0,0 +1,23 @@
+#encoding: utf-8
+require 'sparql/client'
+require 'benchmark'
+require 'json'
+
+module BrowserWebData
+  module EntitySumarization
+
+  end
+end
+
+Dir.glob(File.dirname(__FILE__) + '/utils/*.rb').each { |file| require file }
+Dir.glob(File.dirname(__FILE__) + '/config/*.rb').each { |file| require file }
+
+# Require all gem scripts by their relative names
+Dir[File.dirname(__FILE__) + '/browser_web_data_entity_sumarization/**/*.rb'].each do |file|
+  require(file.gsub('\\', '/').split('/lib/').last[0..-4])
+end
+
+
+
+
+
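For orientation, a minimal usage sketch (not part of the package diff): requiring the gem runs the Dir.glob loaders above, so the utils, config and class files become available under the nested modules. The require name is the gem name from the header; everything else is illustrative.

    require 'browser_web_data_entity_sumarization'

    # After loading, the gem's classes are reachable under the nested modules,
    # e.g. the NIF parser and the Statistic class defined in the files below.
    BrowserWebData::EntitySumarization::NIFLineParser # => parser class with scan helpers
    BrowserWebData::EntitySumarization::Statistic     # => main entry class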
data/lib/browser_web_data_entity_sumarization/entity_sumarization_nif_parser.rb
ADDED
@@ -0,0 +1,59 @@
+# encoding: utf-8
+
+###
+# Core project module
+module BrowserWebData
+
+  ###
+  # Project logic module
+  module EntitySumarization
+
+    ###
+    # The class include helpers to retrieve structured nif data from nif lines.
+    class NIFLineParser
+      include BrowserWebData::EntitySumarizationConfig
+
+      ###
+      # The method apply scan to recognize resource uri from given nif dataset line.
+      #
+      # @param [String] line
+      #
+      # @return [String] resource_uri
+      # @example resource_uri: "http://dbpedia.org/resource/Captain_EO"
+      def self.parse_resource_uri(line)
+        (line.scan(SCAN_REGEXP[:scan_resource])[0])[0].split('?').first
+      end
+
+      ###
+      # The method apply scan to recognize link, anchor, indexes and section from given nif dataset group of 7 lines.
+      #
+      # @param [Array<String>] lines_group
+      #
+      # @return [Hash] nif_data
+      # @example nif_data:
+      #   {
+      #     link: "http://dbpedia.org/resource/Science_fiction_film",
+      #     anchor: "science fiction film",
+      #     indexes: ["33", "53"],
+      #     section: "paragraph_0_419"
+      #   }
+      def self.parse_line_group(lines_group)
+        begin_index = lines_group[2].scan(SCAN_REGEXP[:begin_index])[0]
+        end_index = lines_group[3].scan(SCAN_REGEXP[:end_index])[0]
+        target_resource_link = lines_group[5].scan(SCAN_REGEXP[:target_resource_link])[0]
+        section = lines_group[4].scan(SCAN_REGEXP[:section])[0]
+        anchor = lines_group[6].scan(SCAN_REGEXP[:anchor])[0]
+
+        {
+          link: target_resource_link[1].force_encoding('utf-8'),
+          anchor: anchor[1].force_encoding('utf-8'),
+          indexes: [begin_index[1], end_index[1]],
+          section: section[0].split('=')[1]
+        }
+      end
+
+    end
+
+  end
+
+end
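A hedged sketch of how the parser above can be driven (not part of the package diff): a NIF abstract dataset is read in groups of seven lines, the first line of a group names the resource and the remaining lines carry the link annotation. The dataset path and the fixed-size grouping are illustrative assumptions; in the gem itself the grouping is driven by Statistic#generate_statistics_from_nif further below.

    require 'browser_web_data_entity_sumarization'

    parser = BrowserWebData::EntitySumarization::NIFLineParser

    # 'nif_context_en.ttl' is a placeholder path to a NIF abstract dataset.
    File.foreach('nif_context_en.ttl').each_slice(7) do |lines_group|
      resource_uri = parser.parse_resource_uri(lines_group[0])
      nif_data     = parser.parse_line_group(lines_group)

      # nif_data matches the @example above, e.g.
      # { link: "...", anchor: "...", indexes: ["33", "53"], section: "paragraph_0_419" }
      puts "#{resource_uri} -> #{nif_data[:link]} (#{nif_data[:anchor]})"
    end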
data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicate.rb
ADDED
@@ -0,0 +1,33 @@
+# encoding: utf-8
+
+###
+# Core project module
+module BrowserWebData
+
+  ###
+  # Project logic module
+  module EntitySumarization
+
+
+    ###
+    # The class include helper methods.
+    # (todo definition of predicate instance)
+    class Predicate
+      include BrowserWebData::EntitySumarizationConfig
+
+      ###
+      # The method helps identify unimportant predicate by constants.
+      #
+      # @param [String] property
+      #
+      # @return [TrueClass, FalseClass] result
+      def self.unimportant?(property)
+        property = property.to_s
+        NO_SENSE_PROPERTIES.include?(property) || COMMON_PROPERTIES.include?(property)
+      end
+
+    end
+
+  end
+
+end
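A small illustrative sketch of the filter above (not part of the package diff): predicates listed in the gem's NO_SENSE_PROPERTIES and COMMON_PROPERTIES config constants are dropped before scoring. The example URIs are placeholders.

    predicate_class = BrowserWebData::EntitySumarization::Predicate

    candidate_predicates = [
      'http://dbpedia.org/ontology/wikiPageWikiLink', # placeholder URIs
      'http://dbpedia.org/ontology/director'
    ]

    # Keep only predicates the gem considers meaningful for a summary.
    important = candidate_predicates.reject { |p| predicate_class.unimportant?(p) }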
data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicates_similarity.rb
ADDED
@@ -0,0 +1,263 @@
+# encoding: utf-8
+
+###
+# Core project module
+module BrowserWebData
+
+  ###
+  # Project logic module
+  module EntitySumarization
+
+    ###
+    # The class include methods to identify identical predicates
+    class PredicatesSimilarity
+      include BrowserWebData::EntitySumarizationConfig
+
+      def initialize(results_dir_path, identical_limit = IDENTICAL_PROPERTY_LIMIT, console_output = false)
+        @results_dir_path = results_dir_path
+        @console_output = console_output
+        @identical_limit = identical_limit
+
+        @query = SPARQLRequest.new
+
+        load_identical_predicates
+        load_different_predicates
+      end
+
+      ###
+      # The method return key of identical predicates
+      #
+      # @param [Array<String>] predicates
+      #
+      # @return [String] key
+      def self.get_key(predicates)
+        predicates = [predicates] unless predicates.is_a?(Array)
+        "<#{predicates.join('><')}>" if predicates && !predicates.empty?
+      end
+
+      ###
+      # The method return identical predicates by key
+      #
+      # @param [String] key
+      #
+      # @return [Array<String>] predicates
+      def self.parse_key(key)
+        key.to_s.scan(SCAN_REGEXP[:identical_key]).reduce(:+)
+      end
+
+      ###
+      # The method verify every combination of two predicates.
+      # Method store identify combination in two files identical_predicates.json and different_predicates.json
+      # files contains Array of combination keys
+      #
+      # @param [Array<String>] predicates
+      def identify_identical_predicates(predicates, identical_limit = @identical_limit)
+        @temp_counts ||= {}
+
+        predicates.combination(2).each { |values|
+
+          already_mark_same = find_identical(values)
+          already_mark_different = find_different(values)
+
+          if already_mark_same.nil? && already_mark_different.nil?
+
+            # in case of dbpedia ontology vs. property
+            # automatically became identical
+            unless try_auto_identical(values)
+
+              unless @temp_counts[values[0]]
+                @temp_counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
+              end
+
+              unless @temp_counts[values[1]]
+                @temp_counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
+              end
+
+              x = @temp_counts[values[0]]
+              y = @temp_counts[values[1]]
+              z = @query.get_count_of_identical_predicates(values)
+
+              identical_level = z / [x, y].max
+
+              if identical_level >= identical_limit
+                puts " - result[#{identical_level}] z[#{z}] x[#{x}] y[#{y}] #{values.inspect}" if @console_output
+                add_identical(values)
+              else
+                add_different(values)
+              end
+            end
+
+          end
+
+          true
+        }
+
+      end
+
+      ###
+      # The method helps to recognize if is already marked as identical properties
+      #
+      # @param [Array<String>, String] value
+      #
+      # @return [String, NilClass]
+      def find_identical(value)
+        raise RuntimeError.new('No support identify identical for more than 2 predicates.') if value.is_a?(Array) && value.size >2
+
+        predicates_key = case value
+                         when Array
+                           value = value.map { |v| PredicatesSimilarity.get_key(v) }
+                           @identical_predicates.find { |p|
+                             p[value[0]] && p[value[1]]
+                           }
+                         else
+                           value = PredicatesSimilarity.get_key(value)
+                           @identical_predicates.find { |p|
+                             p[value]
+                           }
+                         end
+
+        PredicatesSimilarity.parse_key(predicates_key)
+      end
+
+      ###
+      # The method helps to recognize if is already marked as different properties
+      #
+      # @param [Array<String>, String] value
+      #
+      # @return [String, NilClass]
+      def find_different(value)
+        raise RuntimeError.new('No support identify identical for more than 2 predicates.') if value.is_a?(Array) && value.size >2
+
+        key = case value
+              when Array
+                value = value.map { |v| PredicatesSimilarity.get_key(v) }
+                @different_predicates.find { |p| p[value[0]] && p[value[1]] }
+              else
+                value = PredicatesSimilarity.get_key(value)
+                @different_predicates.find { |p| p[value] }
+              end
+
+        PredicatesSimilarity.parse_key(key)
+      end
+
+      def add_identical(values)
+        values = values.map { |p| p.to_s }.uniq.sort
+        group_key = PredicatesSimilarity.get_key(values)
+
+        unless @identical_predicates.include?(group_key)
+          @identical_predicates << group_key
+          store_identical_properties
+        end
+      end
+
+      def add_different(values)
+        values = values.map { |p| p.to_s }.uniq.sort
+        group_key = PredicatesSimilarity.get_key(values)
+
+        unless @different_predicates.include?(group_key)
+          @different_predicates << group_key
+
+          @new_diff_counter ||= 0
+          @new_diff_counter += 1
+
+          if @new_diff_counter > 100
+            store_different_predicates
+            @new_diff_counter = 0
+          end
+
+        end
+      end
+
+      def try_auto_identical(values)
+        group_key = PredicatesSimilarity.get_key(values)
+
+        temp = values.map { |val| val.to_s.split('/').last }.uniq
+        if temp.size == 1 && group_key['property/'] && group_key['ontology/']
+          add_identical(values)
+          true
+        else
+          false
+        end
+      end
+
+
+      ###
+      # The method helps to reduce identical predicates by join of common predicate
+      def reduce_identical
+        new_identical = []
+
+        @identical_predicates.each { |key|
+          values = PredicatesSimilarity.parse_key(key)
+          next if new_identical.find { |v| !(v & values).empty? }
+
+          ## find nodes with values predicates
+          values = recursive_find_identical(key, values)
+
+          new_identical << values.uniq.sort
+        }
+
+        @identical_predicates = new_identical.map { |v| PredicatesSimilarity.get_key(v) }
+
+        store_identical_properties
+      end
+
+      def recursive_find_identical(keys, values)
+        keys = [keys] unless keys.is_a?(Array)
+
+        @identical_predicates.each { |this_key|
+          next if keys.include?(this_key)
+          temp = PredicatesSimilarity.parse_key(this_key)
+
+          unless (temp & values).empty?
+            keys << this_key
+            return recursive_find_identical(keys, (values + temp).uniq)
+          end
+        }
+
+        values
+      end
+
+
+      private
+
+
+      def load_identical_predicates
+        unless @identical_predicates
+          file_path = "#{@results_dir_path}/identical_predicates.json"
+          @identical_predicates = ensure_load_json(file_path, [])
+        end
+      end
+
+      def load_different_predicates
+        unless @different_predicates
+          file_path = "#{@results_dir_path}/different_predicates.json"
+          @different_predicates = ensure_load_json(file_path, [])
+        end
+      end
+
+      def store_identical_properties
+        File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
+      end
+
+      def store_different_predicates
+        File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
+      end
+
+      def ensure_load_json(file_path, def_val, json_params = {})
+        if File.exists?(file_path)
+          file_data = File.read(file_path).force_encoding('utf-8')
+          if file_data.size >= 2 # '[]'
+            JSON.parse(file_data, json_params)
+          else
+            def_val
+          end
+        else
+          def_val
+        end
+      end
+
+    end
+
+  end
+
+end
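To make the decision rule in identify_identical_predicates above concrete, a sketch with made-up counts (not part of the package diff): x and y are how many subjects use each predicate, z how many use both (in the gem these come from SPARQLRequest#get_count_of_identical_predicates), and two predicates are grouped as identical when z divided by the larger single count reaches the configured limit.

    x = 12_000.0 # subjects using predicate A (illustrative number)
    y =  9_500.0 # subjects using predicate B (illustrative number)
    z =  9_100.0 # subjects using both A and B (illustrative number)

    identical_limit = 0.9 # assumed value of IDENTICAL_PROPERTY_LIMIT

    identical_level = z / [x, y].max # => ~0.758
    puts identical_level >= identical_limit ? 'identical' : 'different'
    # => different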
data/lib/browser_web_data_entity_sumarization/entity_sumarization_statistics.rb
ADDED
@@ -0,0 +1,609 @@
+# encoding: utf-8
+
+###
+# Core project module
+module BrowserWebData
+
+  ###
+  # Project logic module
+  module EntitySumarization
+
+    ###
+    # Statistic class allow to find, collect and generate knowledge of entity sumarization.
+    # Entity sumarization is based on use dataset of NLP Interchange Format (NIF).
+    # For example datasets from http://wiki.dbpedia.org/nif-abstract-datasets
+    # Knowledge is generate by information in DBpedia.
+    class Statistic
+      include BrowserWebData::EntitySumarizationConfig
+
+      attr_reader :nif_file_path, :results_dir_path
+
+      ###
+      # Create new instance.
+      #
+      # @param [String] nif_dataset_path
+      # @param [String] results_dir_path
+      # @param [TrueClass, FalseClass] console_output Allow puts info to console. Default is false.
+      def initialize(nif_dataset_path, results_dir_path = File.join(__dir__, '../../results'), console_output = false)
+        nif_dataset_path = nif_dataset_path.gsub('\\', '/')
+        results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
+
+        return false unless File.exists?(nif_dataset_path)
+        return false unless File.exists?(results_dir_path)
+
+        @nif_file_path = nif_dataset_path.gsub('\\', '/')
+        @results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
+        @console_output = console_output
+
+        @query = SPARQLRequest.new
+        @predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
+      end
+
+      ###
+      # The method find resource links in given nif file dataset.
+      #
+      # @param [Hash] params
+      # @option params [Array<String>, String] :entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
+      # @option params [Fixnum] :entity_count Best ranked resources by every entity type.
+      # @option params [Fixnum] :best_score_count Count of result predicates to keep.
+      # @option params [FalseClass, TruesClass] :demand_reload
+      # @option params [FalseClass, TruesClass] :identity_identical_predicates
+      def create_by_nif_dataset(params)
+        params[:entity_types] = [params[:entity_types]] unless params[:entity_types].is_a?(Array)
+
+        generate_statistics_from_nif(params[:entity_types], params[:entity_count], params[:demand_reload])
+
+        params[:entity_types].each { |type|
+          generate_knowledge_base(type, params[:best_score_count], params[:identity_identical_predicates])
+        }
+      end
+
+      ###
+      # The method return list of best ranked resources by required entity types.
+      #
+      # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
+      # @param [Fixnum] count Count of best ranked resources
+      #
+      # @return [Hash] resources
+      def get_best_ranked_resources(entity_types, count = 10)
+        resources = {}
+        entity_types = [entity_types] unless entity_types.is_a?(Array)
+
+        entity_types.each { |type|
+          top_ranked_entities = @query.get_resources_by_dbpedia_page_rank(type, count)
+
+          top_ranked_entities.each { |solution|
+            resources[solution.entity.value] = {type: type, rank: solution.rank.value.to_f}
+          }
+        }
+
+        resources
+      end
+
+      ###
+      # The method find links in given nif dataset. After find collect relations #find_relations.
+      # For each resource generate file in @results_dir_path.
+      #
+      # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
+      # @param [Fixnum] count Count of best ranked resources
+      # @param [FalseClass, TruesClass] demand_reload
+      def generate_statistics_from_nif(entity_types, count = 10, demand_reload = false)
+        resources = get_best_ranked_resources(entity_types, count)
+
+        resources = keep_unloaded(resources) unless demand_reload
+
+        actual_resource_data = []
+        lines_group = []
+
+        begin
+          time_start = Time.now
+          nif_file = File.open(@nif_file_path, 'r')
+          line = nif_file.readline
+
+          until nif_file.eof?
+            line = nif_file.readline
+
+            if lines_group.size == 7
+              # evaulate group (7 lines)
+              this_resource_uri = NIFLineParser.parse_resource_uri(lines_group[0])
+
+              if resources.keys.include?(this_resource_uri)
+                # process group, is requested
+                resource_uri = this_resource_uri
+                actual_resource_data << NIFLineParser.parse_line_group(lines_group)
+
+              elsif !actual_resource_data.empty?
+                # resource changed, process actual_resource_data
+                resource_hash = resources.delete(resource_uri)
+                type = resource_hash[:type]
+
+                this_time = (Time.now - time_start).round(2)
+                puts "\n#{resource_uri}\n- nif found in #{this_time}\n- resources to find #{resources.size}" if @console_output
+
+                result_relations = find_relations(resource_uri, actual_resource_data, type)
+                generate_result_file(resource_uri, type, result_relations, this_time)
+
+                break if resources.empty?
+
+                actual_resource_data = []
+                time_start = Time.now
+              end
+
+              # start new group
+              lines_group = [line]
+            else
+
+              # join line to group
+              lines_group << line
+            end
+
+          end
+
+        ensure
+          nif_file.close if nif_file && !nif_file.closed?
+        end
+      end
+
+      ###
+      # The method helps to recollect relations by already generated result files.
+      #
+      # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
+      # @param [Fixnum] count Count of best ranked resources
+      def refresh_statistics_in_files(entity_types, count = 10)
+        resources = get_best_ranked_resources(entity_types, count)
+
+        resources = keep_loaded(resources)
+
+        resources.each { |resource_uri, resource_info|
+          puts "_____ #{resource_uri} _____" if @console_output
+
+          update_nif_file_properties(resource_uri, resource_info[:type]) { |link|
+            get_predicates_by_link(resource_uri, link, resource_info[:type])
+          }
+        }
+
+      end
+
+
+      ###
+      # The method find predicates by given link.
+      # Find strict predicates that are in relation: <resource> ?predicate <link> .
+      # Find predicates that are in relation: ?subject a <type> . ?subject ?predicate <link>
+      #
+      # @param [String] resource_uri Resource for which will be find strict properties
+      # @param [String] link Link that has some importance to resource or entity type.
+      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
+      #
+      # @return [Hash] result
+      def get_predicates_by_link(resource_uri, link, type)
+        properties = {type => {}}
+        strict_properties = {type => {}}
+
+        @query.get_all_predicates_by_subject_object(resource_uri, link).each { |solution|
+          predicate = solution.to_h
+          property = predicate[:property].to_s.force_encoding('utf-8')
+
+          next if Predicate.unimportant?(property)
+
+          count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
+          strict_properties[type][property] = count if count > 0
+        }
+
+        @query.get_all_predicates_by_object(link).each { |solution|
+          predicate = solution.to_h
+          property = predicate[:property].to_s.force_encoding('utf-8')
+
+          next if Predicate.unimportant?(property) || strict_properties[type][property]
+
+          count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
+          properties[type][property] = count if count > 0
+        }
+
+
+        {properties: properties, strict_properties: strict_properties}
+      end
+
+      ###
+      # The method helps to store founded information from nif for given resource.
+      #
+      # @param [String] resource_uri
+      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
+      # @param [Hsah] result_relations Hash generated by method #find_relations
+      # @option result_relations [Hash] :sections Contains key 'section_type' value 'position'
+      # @option result_relations [Array<Hash>] :relations Hashes generated by method #get_predicates_by_link
+      #
+      # @param [Float] this_time Relative time of find in nif dataset.
+      def generate_result_file(resource_uri, type, result_relations, this_time)
+        section_degradation = result_relations[:sections].map { |section_type, position|
+          index = result_relations[:sections].keys.index(section_type)
+
+          # recognize value of degradation by relative position paragraphs in document
+          position[:degradation] = 1 - ((index / result_relations[:sections].size) / 10.0)
+
+          {section_type => position}
+        }.reduce(:merge)
+
+        total_size = section_degradation.max_by { |_, v| v[:to] }[1][:to].to_f
+
+        result_nif_data = result_relations[:relations].map { |relation|
+          paragraph_position = section_degradation[relation[:section]]
+
+          # weight is lowest by relative distance from document start
+          position_weight = (1 - ((relation[:indexes][0].to_i) / total_size))
+          # weight is also degraded by index of paragraph
+          relation[:weight] = (position_weight * paragraph_position[:degradation]).round(4)
+
+          relation
+        }
+
+        result = {
+          process_time: {nif_find: this_time, relations_find: result_relations[:time]},
+          resource_uri: resource_uri,
+          nif_data: result_nif_data
+        }
+
+        result_path = get_resource_file_path(resource_uri, type)
+        File.open(result_path, 'w:utf-8') { |f| f << JSON.pretty_generate(result) }
+      end
+
+      ###
+      # The method process all generated result files from nif dataset (by entity class type)
+      # to one result knowledge base file.
+      #
+      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
+      # @param [Fixnum] best_count Define max count of properties that will be assign to entity class type.
+      # @param [TrueClass, FalseClass] identify_identical Flag for process identify and group identical properties as one item.
+      def generate_knowledge_base(type, best_count = 20, identify_identical = true)
+        puts "_____ #{type} _____" if @console_output
+        files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
+        type = type.to_s.to_sym
+
+        knowledge_data = {type => []}
+
+        files.each { |file_path|
+          puts "- calculate #{file_path}" if @console_output
+          file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)
+
+          if identify_identical
+            file_data[:nif_data].each { |data|
+              all_properties = data[:properties][type].keys + ((data[:strict_properties]||{})[type] || {}).keys.uniq
+              @predicates_similarity.identify_identical_predicates(all_properties)
+            }
+          end
+
+          file_data[:nif_data].each { |found|
+
+            properties = found[:properties][type.to_sym]
+            strict_properties = (found[:strict_properties] ||{})[type] || {}
+            weight = found[:weight]
+
+            strict_properties.each { |property, count|
+              property = property.to_s
+              value = count.to_i * weight
+
+              prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
+                old_score = from_knowledge[:score] * from_knowledge[:counter]
+                from_knowledge[:counter] += 1
+                (old_score + value) / from_knowledge[:counter]
+              }
+            }
+
+            properties.each { |property, count|
+              property = property.to_s
+              value = count.to_i * weight
+
+              prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
+                old_score = from_knowledge[:score] * from_knowledge[:counter]
+                from_knowledge[:counter] += 1
+                (old_score + value) / from_knowledge[:counter]
+              }
+            }
+          }
+
+          unless knowledge_data[type].empty?
+            max_weight = knowledge_data[type].max_by { |data| data[:score] }[:score]
+            knowledge_data[type] = knowledge_data[type].map { |hash|
+              hash[:score] = (hash[:score] / max_weight).round(4)
+              hash
+            }
+          end
+        }
+
+        global_properties = get_global_statistic_by_type(type) || {}
+        if identify_identical
+          @predicates_similarity.identify_identical_predicates(global_properties.keys)
+        end
+
+        if global_properties.size > 0
+          max_count = global_properties.max_by { |_, count| count }[1].to_f
+          global_properties.each { |property, count|
+
+            value = count / max_count
+
+            prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
+              from_knowledge[:score] > 0 ? ((from_knowledge[:score] + value) / 2.0).round(4) : value.round(4)
+            }
+          }
+        end
+
+        knowledge_data[type].map! { |hash|
+          hash.delete(:counter)
+          hash
+        }
+
+        knowledge_data[type] = knowledge_data[type].sort_by { |hash| hash[:score] }.reverse.take(best_count)
+
+        if identify_identical
+          @predicates_similarity.reduce_identical
+        end
+
+        update_knowledge_base(knowledge_data)
+      end
+
+      ###
+      # The method generate simple statistics that contain all predicates that links to literal.
+      # Predicates are grouped by entity class type and also contains count of total occurrence.
+      # Predicates find from best ranked resources.
+      #
+      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
+      # @param [Fixnum] count Count of best ranked resources
+      def generate_literal_statistics(type = nil, count = 10)
+        unless type
+          type = get_all_classes
+        end
+
+        type = [type] unless type.is_a?(Array)
+
+        type.each_with_index { |entity_type, index|
+          all_properties = {}
+          puts "#{__method__} - start process entity type: #{entity_type} [#{(index / type.size.to_f).round(2)}]" if @console_output
+          entity_type = entity_type.to_s.to_sym
+
+          get_best_ranked_resources(entity_type, count).each { |resource, _|
+            properties = @query.get_all_predicates_by_subject(resource.to_s, true).map { |solution_prop|
+              solution_prop[:property].to_s
+            } || []
+
+            properties.uniq.each { |prop|
+              next if Predicate.unimportant?(prop)
+              all_properties[entity_type] ||= {}
+              all_properties[entity_type][prop] ||= 0
+              all_properties[entity_type][prop] += 1
+            }
+
+          }
+
+          update_global_statistic(all_properties)
+        }
+      end
+
+      ###
+      # The method load all defined entity class types by http://mappings.dbpedia.org/server/ontology/classes/
+      #
+      # @param [String] path
+      #
+      # @return [Hash] classes
+      def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
+        data = ensure_load_json(path, {})
+        HashHelper.recursive_map_keys(data)
+      end
+
+
+      private
+
+      ###
+      # The method helps to continue of process find links in nif dataset.
+      #
+      # @param [String] resource_uri
+      # @param [Hash] actual_resource_data Part data extracted from nif dataset for given resource_uri
+      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
+      #
+      # @return [Hash] out
+      def find_relations(resource_uri, actual_resource_data, type)
+        out = {
+          sections: {},
+          relations: []
+        }
+
+        puts "- properties to find size[#{actual_resource_data.size}]" if @console_output
+
+        time = Benchmark.realtime {
+          out[:relations] = actual_resource_data.map! { |resource_data|
+            section_group = resource_data[:section].scan(SCAN_REGEXP[:group])
+
+            type_key = resource_data[:section].force_encoding('utf-8')
+
+            out[:sections][type_key] ||= {
+              type: section_group[0][0],
+              from: section_group[0][1].to_i,
+              to: section_group[0][2].to_i,
+            }
+
+            result = get_predicates_by_link(resource_uri, resource_data[:link], type)
+
+            resource_data[:properties] = result[:properties]
+            resource_data[:strict_properties] = result[:strict_properties]
+
+            resource_data
+          }.compact || []
+        }
+
+        out[:time] = time.round(2)
+
+        puts "- properties found in #{out[:time]}" if @console_output
+
+        out
+      end
+
+
+      ###
+      # The method generate file path for given resource URI.
+      # Also ensure to exist sub directory by resource entity type.
+      #
+      # @param [String] resource_uri
+      # @param [String] type
+      #
+      # @return [String] resource_file_path
+      def get_resource_file_path(resource_uri, type)
+        type = type.split('/').last
+        resource_name = resource_uri.split('/').last
+
+        dir_path = "#{@results_dir_path}/#{type}"
+        Dir.mkdir(dir_path) unless Dir.exist?(dir_path)
+
+        "#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json"
+      end
+
+      ###
+      # The method helps update found predicates to stored links.
+      #
+      # @param [String] resource_uri
+      # @param [String] type
+      #
+      # @return [Array<Hash>] old_data, new_data
+      def update_nif_file_properties(resource_uri, type)
+        if block_given?
+          path = get_resource_file_path(resource_uri, type)
+          old_data = ensure_load_json(path, {}, symbolize_names: true)
+
+          new_data = old_data.dup
+
+          time = Benchmark.realtime {
+            new_data[:nif_data] = old_data[:nif_data].map { |hash|
+              actual_link = hash[:link].to_sym
+
+              result = yield actual_link
+
+              hash[:strict_properties] = result[:strict_properties] if result[:strict_properties]
+              hash[:properties] = result[:properties] if result[:properties]
+
+              hash
+            }
+          }
+
+          new_data[:process_time][:relations_find] = time.round(2)
+
+          File.write(path, JSON.pretty_generate(new_data))
+          return old_data, new_data
+        end
+      end
+
+      ###
+      # The method in yield block give founded hash for required property.
+      # This hash contains counter, score and also all identical properties.
+      # At the end update score that was get from yield block as return value.
+      #
+      # @param [String] property
+      # @param [Array<Hash>] this_knowledge_data
+      #
+      # @yield param found
+      # @yield return score
+      def prepare_property_to_knowledge(property, this_knowledge_data)
+        property = property.to_s
+
+        this_knowledge_data ||= []
+        found = this_knowledge_data.find { |data| data[:predicates].include?(property) }
+
+        if found.nil? || found.empty?
+          # add new
+
+          identical_properties = @predicates_similarity.find_identical(property)
+
+          found = {
+            counter: 0,
+            score: 0.0,
+            predicates: identical_properties || [property.to_s]
+          }
+
+          this_knowledge_data << found
+        end
+
+        new_score = yield found
+
+
+        found[:score] = new_score
+      end
+
+      ###
+      # The method delete all resources that already has created result file
+      #
+      # @param [Hash{resource=>type}] resources
+      def keep_unloaded(resources)
+        resources.delete_if { |resource, values|
+          dir_path = "#{@results_dir_path}/#{values[:type]}"
+          resource_name = resource.split('/').last
+          File.exists?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
+        }
+      end
+
+      ###
+      # The method keep all resources that already has created result file
+      #
+      # @param [Hash{resource=>type}] resources
+      def keep_loaded(resources)
+        resources.keep_if { |resource, values|
+          dir_path = "#{@results_dir_path}/#{values[:type]}"
+          resource_name = resource.split('/').last
+          File.exists?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
+        }
+      end
+
+      ###
+      # The method allow to update knowledge base by every entity class type.
+      #
+      # @param [Hash] new_data
+      def update_knowledge_base(new_data)
+        path = "#{@results_dir_path}/knowledge_base.json"
+        old_data = ensure_load_json(path, {}, symbolize_names: true)
+        File.write(path, JSON.pretty_generate(old_data.merge(new_data)))
+      end
+
+      ###
+      # The method allow to update global statistic by every entity class type.
+      #
+      # @param [Hash] new_data
+      def update_global_statistic(new_data)
+        path = "#{@results_dir_path}/global_statistic.json"
+        old_data = ensure_load_json(path, {}, symbolize_names: true)
+        File.write(path, JSON.pretty_generate(old_data.merge(new_data)))
+      end
+
+      ###
+      # The method returns global properties for given entity class type.
+      #
+      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
+      #
+      # @return [Hash] global_statistic_by_type
+      def get_global_statistic_by_type(type)
+        type = type.to_s.to_sym
+        path = "#{@results_dir_path}/global_statistic.json"
+        data = ensure_load_json(path, {}, symbolize_names: true)
+        data[type]
+      end
+
+      ###
+      # The method helps to load json file.
+      #
+      # @param [String] file_path
+      # @param [String] def_val If no exist file add values as default.
+      # @param [Hash] json_params JSON.parse params
+      #
+      # @return [Object] json
+      def ensure_load_json(file_path, def_val, json_params = {})
+        if File.exists?(file_path)
+          file_data = File.read(file_path).force_encoding('utf-8')
+          if file_data.size >= 2 # '[]'
+            JSON.parse(file_data, json_params)
+          else
+            def_val
+          end
+        else
+          def_val
+        end
+      end
+
+
+    end
+  end
+
+end
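Finally, a hedged end-to-end sketch of driving the Statistic class above (not part of the package diff). The paths and the DBpedia class URI are placeholders; the constructor arguments, option keys and method names come from the code in this file.

    require 'browser_web_data_entity_sumarization'

    statistic = BrowserWebData::EntitySumarization::Statistic.new(
      '/data/nif_context_en.ttl', # placeholder NIF dataset path
      '/data/results',            # placeholder results directory
      true                        # print progress to console
    )

    # Collect relations for the 10 best-ranked Film resources found in the NIF
    # dataset, then build the knowledge base entry keeping the 20 best predicates.
    statistic.create_by_nif_dataset(
      entity_types: 'http://dbpedia.org/ontology/Film',
      entity_count: 10,
      best_score_count: 20,
      identity_identical_predicates: true
    )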