browser_web_data_entity_sumarization 1.0.0beta1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/browser_web_data_entity_sumarization.rb +23 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_nif_parser.rb +59 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicate.rb +33 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_predicates_similarity.rb +263 -0
- data/lib/browser_web_data_entity_sumarization/entity_sumarization_statistics.rb +609 -0
- data/lib/browser_web_data_entity_sumarization/sparql_request.rb +74 -0
- data/lib/browser_web_data_entity_sumarization/version.rb +3 -0
- data/lib/config/entity_sumarization_config.rb +48 -0
- data/lib/knowledge/classes_hierarchy.json +906 -0
- data/lib/knowledge/common_properties.json +23 -0
- data/lib/knowledge/entity_classes.json +1 -0
- data/lib/knowledge/knowledge_base.json +40642 -0
- data/lib/utils/cache_helper.rb +69 -0
- data/lib/utils/hash_helper.rb +79 -0
- data/lib/utils/sparql_queries.rb +126 -0
- data/lib/utils/string_helper.rb +31 -0
- metadata +74 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b24bd4b2cd4ae8c908f6549710531303a49c36e5
|
4
|
+
data.tar.gz: 0c48cc2e23703a28dbe262c3535703b87698f63f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 786bfb65d8250af636e9a1187cc7530738e734252d405c53bc10c0f0c03ba0ea4953fca5b32613817c50c4c67200df09ea31ff96f0a2ec8f0c481b53a965e00b
|
7
|
+
data.tar.gz: 3448f20b4aa7b7bd457da33479435fd557ef14c87f13264cf3a99286ecd053511cbf341f0ae2f1cb75a516fe5bf4d9e73bd6a0e46b7de701d93d1938f6851c12
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#encoding: utf-8
# Gem entry point: loads runtime dependencies, declares the namespace and
# requires every helper, config and logic script shipped with the gem.
require 'sparql/client'
require 'benchmark'
require 'json'

###
# Core project module
module BrowserWebData

  ###
  # Project logic module (populated by the scripts required below)
  module EntitySumarization

  end
end

# Helpers and configuration must load before the gem's own classes,
# because those classes include/reference constants defined here.
Dir.glob(File.dirname(__FILE__) + '/utils/*.rb').each { |file| require file }
Dir.glob(File.dirname(__FILE__) + '/config/*.rb').each { |file| require file }

# Require all gem scripts by their relative names
# (normalize Windows path separators, keep the path relative to lib/,
# and strip the trailing '.rb' via [0..-4])
Dir[File.dirname(__FILE__) + '/browser_web_data_entity_sumarization/**/*.rb'].each do |file|
  require(file.gsub('\\', '/').split('/lib/').last[0..-4])
end
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: utf-8

###
# Core project module
module BrowserWebData

  ###
  # Project logic module
  module EntitySumarization

    ###
    # Helpers that turn raw NIF dataset lines into structured data.
    class NIFLineParser
      include BrowserWebData::EntitySumarizationConfig

      ###
      # Recognize the resource URI in a single NIF dataset line.
      #
      # @param [String] line
      #
      # @return [String] resource_uri
      # @example resource_uri: "http://dbpedia.org/resource/Captain_EO"
      def self.parse_resource_uri(line)
        first_match = line.scan(SCAN_REGEXP[:scan_resource]).first
        first_match.first.split('?').first
      end

      ###
      # Extract link, anchor, character indexes and section
      # from a NIF dataset group of 7 lines.
      #
      # @param [Array<String>] lines_group
      #
      # @return [Hash] nif_data
      # @example nif_data:
      #   {
      #     link: "http://dbpedia.org/resource/Science_fiction_film",
      #     anchor: "science fiction film",
      #     indexes: ["33", "53"],
      #     section: "paragraph_0_419"
      #   }
      def self.parse_line_group(lines_group)
        # each NIF attribute lives on a fixed line offset within the group
        first_scan = ->(line, pattern) { line.scan(SCAN_REGEXP[pattern])[0] }

        begin_index = first_scan.call(lines_group[2], :begin_index)
        end_index = first_scan.call(lines_group[3], :end_index)
        section = first_scan.call(lines_group[4], :section)
        target_resource_link = first_scan.call(lines_group[5], :target_resource_link)
        anchor = first_scan.call(lines_group[6], :anchor)

        {
          link: target_resource_link[1].force_encoding('utf-8'),
          anchor: anchor[1].force_encoding('utf-8'),
          indexes: [begin_index[1], end_index[1]],
          section: section[0].split('=')[1]
        }
      end

    end

  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: utf-8

###
# Core project module
module BrowserWebData

  ###
  # Project logic module
  module EntitySumarization


    ###
    # Helper predicates over RDF property URIs.
    class Predicate
      include BrowserWebData::EntitySumarizationConfig

      ###
      # Identify predicates that carry no useful information for
      # entity summarization (configured constant lists).
      #
      # @param [String] property
      #
      # @return [TrueClass, FalseClass] result
      def self.unimportant?(property)
        uri = property.to_s
        NO_SENSE_PROPERTIES.include?(uri) || COMMON_PROPERTIES.include?(uri)
      end

    end

  end

end
|
@@ -0,0 +1,263 @@
|
|
1
|
+
# encoding: utf-8

###
# Core project module
module BrowserWebData

  ###
  # Project logic module
  module EntitySumarization

    ###
    # The class include methods to identify identical predicates.
    # Decisions are cached in identical_predicates.json / different_predicates.json
    # under the results directory.
    class PredicatesSimilarity
      include BrowserWebData::EntitySumarizationConfig

      ###
      # @param [String] results_dir_path Directory holding the predicate cache files.
      # @param [Float] identical_limit Minimal shared-usage ratio to mark two predicates identical.
      # @param [TrueClass, FalseClass] console_output Allow puts info to console. Default is false.
      def initialize(results_dir_path, identical_limit = IDENTICAL_PROPERTY_LIMIT, console_output = false)
        @results_dir_path = results_dir_path
        @console_output = console_output
        @identical_limit = identical_limit

        @query = SPARQLRequest.new

        load_identical_predicates
        load_different_predicates
      end

      ###
      # The method return key of identical predicates
      #
      # @param [Array<String>] predicates
      #
      # @return [String] key e.g. "<uri1><uri2>", nil for empty input
      def self.get_key(predicates)
        predicates = [predicates] unless predicates.is_a?(Array)
        "<#{predicates.join('><')}>" if predicates && !predicates.empty?
      end

      ###
      # The method return identical predicates by key
      #
      # @param [String] key
      #
      # @return [Array<String>] predicates
      def self.parse_key(key)
        key.to_s.scan(SCAN_REGEXP[:identical_key]).reduce(:+)
      end

      ###
      # The method verify every combination of two predicates.
      # Method store identified combinations in two files identical_predicates.json
      # and different_predicates.json; files contain Array of combination keys.
      #
      # @param [Array<String>] predicates
      def identify_identical_predicates(predicates, identical_limit = @identical_limit)
        @temp_counts ||= {}

        predicates.combination(2).each { |values|

          already_mark_same = find_identical(values)
          already_mark_different = find_different(values)

          if already_mark_same.nil? && already_mark_different.nil?

            # in case of dbpedia ontology vs. property
            # automatically became identical
            unless try_auto_identical(values)

              # cache per-predicate usage counts, the SPARQL call is expensive
              unless @temp_counts[values[0]]
                @temp_counts[values[0]] = @query.get_count_of_identical_predicates(values[0])
              end

              unless @temp_counts[values[1]]
                @temp_counts[values[1]] = @query.get_count_of_identical_predicates(values[1])
              end

              x = @temp_counts[values[0]]
              y = @temp_counts[values[1]]
              z = @query.get_count_of_identical_predicates(values)

              # FIX(review): force Float division — with Integer counts the original
              # `z / [x, y].max` truncated to 0 whenever z < max(x, y), so the
              # threshold below could effectively never trigger.
              identical_level = z.to_f / [x, y].max

              if identical_level >= identical_limit
                puts " - result[#{identical_level}] z[#{z}] x[#{x}] y[#{y}] #{values.inspect}" if @console_output
                add_identical(values)
              else
                add_different(values)
              end
            end

          end

          true
        }

      end

      ###
      # The method helps to recognize if is already marked as identical properties
      #
      # @param [Array<String>, String] value
      #
      # @return [Array<String>, NilClass] parsed predicates of the matching key, or nil
      def find_identical(value)
        raise RuntimeError.new('No support identify identical for more than 2 predicates.') if value.is_a?(Array) && value.size > 2

        predicates_key = case value
                           when Array
                             value = value.map { |v| PredicatesSimilarity.get_key(v) }
                             # String#[] with a substring returns the substring when contained
                             @identical_predicates.find { |p|
                               p[value[0]] && p[value[1]]
                             }
                           else
                             value = PredicatesSimilarity.get_key(value)
                             @identical_predicates.find { |p|
                               p[value]
                             }
                         end

        PredicatesSimilarity.parse_key(predicates_key)
      end

      ###
      # The method helps to recognize if is already marked as different properties
      #
      # @param [Array<String>, String] value
      #
      # @return [Array<String>, NilClass] parsed predicates of the matching key, or nil
      def find_different(value)
        raise RuntimeError.new('No support identify identical for more than 2 predicates.') if value.is_a?(Array) && value.size > 2

        key = case value
                when Array
                  value = value.map { |v| PredicatesSimilarity.get_key(v) }
                  @different_predicates.find { |p| p[value[0]] && p[value[1]] }
                else
                  value = PredicatesSimilarity.get_key(value)
                  @different_predicates.find { |p| p[value] }
              end

        PredicatesSimilarity.parse_key(key)
      end

      ###
      # Mark a predicate pair as identical and persist the cache.
      #
      # @param [Array<String>] values
      def add_identical(values)
        values = values.map { |p| p.to_s }.uniq.sort
        group_key = PredicatesSimilarity.get_key(values)

        unless @identical_predicates.include?(group_key)
          @identical_predicates << group_key
          store_identical_properties
        end
      end

      ###
      # Mark a predicate pair as different; flush to disk every 100 new entries.
      #
      # @param [Array<String>] values
      def add_different(values)
        values = values.map { |p| p.to_s }.uniq.sort
        group_key = PredicatesSimilarity.get_key(values)

        unless @different_predicates.include?(group_key)
          @different_predicates << group_key

          @new_diff_counter ||= 0
          @new_diff_counter += 1

          if @new_diff_counter > 100
            store_different_predicates
            @new_diff_counter = 0
          end

        end
      end

      ###
      # Heuristic: dbpedia.org/property/X and dbpedia.org/ontology/X with the
      # same local name are considered identical without querying.
      #
      # @param [Array<String>] values
      #
      # @return [TrueClass, FalseClass] true when the pair was auto-marked identical
      def try_auto_identical(values)
        group_key = PredicatesSimilarity.get_key(values)

        temp = values.map { |val| val.to_s.split('/').last }.uniq
        if temp.size == 1 && group_key['property/'] && group_key['ontology/']
          add_identical(values)
          true
        else
          false
        end
      end


      ###
      # The method helps to reduce identical predicates by join of common predicate
      def reduce_identical
        new_identical = []

        @identical_predicates.each { |key|
          values = PredicatesSimilarity.parse_key(key)
          # skip keys already merged into an earlier group
          next if new_identical.find { |v| !(v & values).empty? }

          ## find nodes with values predicates
          values = recursive_find_identical(key, values)

          new_identical << values.uniq.sort
        }

        @identical_predicates = new_identical.map { |v| PredicatesSimilarity.get_key(v) }

        store_identical_properties
      end

      ###
      # Transitively collect all predicates connected to +values+ through
      # any stored identical key.
      #
      # @param [Array<String>, String] keys Keys already visited.
      # @param [Array<String>] values Accumulated predicates.
      #
      # @return [Array<String>] values
      def recursive_find_identical(keys, values)
        keys = [keys] unless keys.is_a?(Array)

        @identical_predicates.each { |this_key|
          next if keys.include?(this_key)
          temp = PredicatesSimilarity.parse_key(this_key)

          unless (temp & values).empty?
            keys << this_key
            return recursive_find_identical(keys, (values + temp).uniq)
          end
        }

        values
      end


      private


      # Lazy-load the identical predicate cache from disk.
      def load_identical_predicates
        unless @identical_predicates
          file_path = "#{@results_dir_path}/identical_predicates.json"
          @identical_predicates = ensure_load_json(file_path, [])
        end
      end

      # Lazy-load the different predicate cache from disk.
      def load_different_predicates
        unless @different_predicates
          file_path = "#{@results_dir_path}/different_predicates.json"
          @different_predicates = ensure_load_json(file_path, [])
        end
      end

      def store_identical_properties
        # FIX(review): originally wrote @different_predicates into
        # different_predicates.json (copy-paste of store_different_predicates),
        # so identical predicates were never persisted and the different cache
        # was clobbered on every add_identical/reduce_identical call.
        File.write("#{@results_dir_path}/identical_predicates.json", JSON.generate(@identical_predicates))
      end

      def store_different_predicates
        File.write("#{@results_dir_path}/different_predicates.json", JSON.generate(@different_predicates))
      end

      ###
      # Read a JSON file, falling back to +def_val+ when missing or trivially empty.
      #
      # @param [String] file_path
      # @param [Object] def_val
      # @param [Hash] json_params Options forwarded to JSON.parse.
      def ensure_load_json(file_path, def_val, json_params = {})
        # File.exist? — File.exists? is deprecated and removed in Ruby 3.2
        if File.exist?(file_path)
          file_data = File.read(file_path).force_encoding('utf-8')
          if file_data.size >= 2 # '[]'
            JSON.parse(file_data, json_params)
          else
            def_val
          end
        else
          def_val
        end
      end

    end

  end

end
|
@@ -0,0 +1,609 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# Core project module
|
5
|
+
module BrowserWebData
|
6
|
+
|
7
|
+
###
|
8
|
+
# Project logic module
|
9
|
+
module EntitySumarization
|
10
|
+
|
11
|
+
###
|
12
|
+
# Statistic class allow to find, collect and generate knowledge of entity sumarization.
|
13
|
+
# Entity sumarization is based on use dataset of NLP Interchange Format (NIF).
|
14
|
+
# For example datasets from http://wiki.dbpedia.org/nif-abstract-datasets
|
15
|
+
# Knowledge is generate by information in DBpedia.
|
16
|
+
class Statistic
|
17
|
+
include BrowserWebData::EntitySumarizationConfig
|
18
|
+
|
19
|
+
attr_reader :nif_file_path, :results_dir_path
|
20
|
+
|
21
|
+
###
|
22
|
+
# Create new instance.
|
23
|
+
#
|
24
|
+
# @param [String] nif_dataset_path
|
25
|
+
# @param [String] results_dir_path
|
26
|
+
# @param [TrueClass, FalseClass] console_output Allow puts info to console. Default is false.
|
27
|
+
def initialize(nif_dataset_path, results_dir_path = File.join(__dir__, '../../results'), console_output = false)
|
28
|
+
nif_dataset_path = nif_dataset_path.gsub('\\', '/')
|
29
|
+
results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
|
30
|
+
|
31
|
+
return false unless File.exists?(nif_dataset_path)
|
32
|
+
return false unless File.exists?(results_dir_path)
|
33
|
+
|
34
|
+
@nif_file_path = nif_dataset_path.gsub('\\', '/')
|
35
|
+
@results_dir_path = results_dir_path.gsub('\\', '/').chomp('/')
|
36
|
+
@console_output = console_output
|
37
|
+
|
38
|
+
@query = SPARQLRequest.new
|
39
|
+
@predicates_similarity = PredicatesSimilarity.new(@results_dir_path)
|
40
|
+
end
|
41
|
+
|
42
|
+
###
|
43
|
+
# The method find resource links in given nif file dataset.
|
44
|
+
#
|
45
|
+
# @param [Hash] params
|
46
|
+
# @option params [Array<String>, String] :entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
|
47
|
+
# @option params [Fixnum] :entity_count Best ranked resources by every entity type.
|
48
|
+
# @option params [Fixnum] :best_score_count Count of result predicates to keep.
|
49
|
+
# @option params [FalseClass, TruesClass] :demand_reload
|
50
|
+
# @option params [FalseClass, TruesClass] :identity_identical_predicates
|
51
|
+
def create_by_nif_dataset(params)
|
52
|
+
params[:entity_types] = [params[:entity_types]] unless params[:entity_types].is_a?(Array)
|
53
|
+
|
54
|
+
generate_statistics_from_nif(params[:entity_types], params[:entity_count], params[:demand_reload])
|
55
|
+
|
56
|
+
params[:entity_types].each { |type|
|
57
|
+
generate_knowledge_base(type, params[:best_score_count], params[:identity_identical_predicates])
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
61
|
+
###
|
62
|
+
# The method return list of best ranked resources by required entity types.
|
63
|
+
#
|
64
|
+
# @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
|
65
|
+
# @param [Fixnum] count Count of best ranked resources
|
66
|
+
#
|
67
|
+
# @return [Hash] resources
|
68
|
+
def get_best_ranked_resources(entity_types, count = 10)
|
69
|
+
resources = {}
|
70
|
+
entity_types = [entity_types] unless entity_types.is_a?(Array)
|
71
|
+
|
72
|
+
entity_types.each { |type|
|
73
|
+
top_ranked_entities = @query.get_resources_by_dbpedia_page_rank(type, count)
|
74
|
+
|
75
|
+
top_ranked_entities.each { |solution|
|
76
|
+
resources[solution.entity.value] = {type: type, rank: solution.rank.value.to_f}
|
77
|
+
}
|
78
|
+
}
|
79
|
+
|
80
|
+
resources
|
81
|
+
end
|
82
|
+
|
83
|
+
      ###
      # The method find links in given nif dataset. After find collect relations #find_relations.
      # For each resource generate file in @results_dir_path.
      #
      # Streams the NIF file line by line, accumulating 7-line annotation groups;
      # groups belonging to a requested resource are collected, and when the
      # current resource changes, its collected data is flushed to a result file.
      #
      # @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
      # @param [Fixnum] count Count of best ranked resources
      # @param [FalseClass, TruesClass] demand_reload When false, already-processed resources are skipped.
      def generate_statistics_from_nif(entity_types, count = 10, demand_reload = false)
        resources = get_best_ranked_resources(entity_types, count)

        # NOTE(review): keep_unloaded is defined outside this view — presumably it
        # drops resources whose result files already exist; confirm before relying on it.
        resources = keep_unloaded(resources) unless demand_reload

        actual_resource_data = []
        lines_group = []

        begin
          time_start = Time.now
          nif_file = File.open(@nif_file_path, 'r')
          # initial readline consumes the dataset header line before the loop starts
          line = nif_file.readline

          until nif_file.eof?
            line = nif_file.readline

            if lines_group.size == 7
              # evaulate group (7 lines)
              this_resource_uri = NIFLineParser.parse_resource_uri(lines_group[0])

              if resources.keys.include?(this_resource_uri)
                # process group, is requested
                resource_uri = this_resource_uri
                actual_resource_data << NIFLineParser.parse_line_group(lines_group)

              elsif !actual_resource_data.empty?
                # resource changed, process actual_resource_data
                resource_hash = resources.delete(resource_uri)
                type = resource_hash[:type]

                this_time = (Time.now - time_start).round(2)
                puts "\n#{resource_uri}\n- nif found in #{this_time}\n- resources to find #{resources.size}" if @console_output

                result_relations = find_relations(resource_uri, actual_resource_data, type)
                generate_result_file(resource_uri, type, result_relations, this_time)

                # stop scanning the (large) dataset once every requested resource is done
                break if resources.empty?

                actual_resource_data = []
                time_start = Time.now
              end

              # start new group
              lines_group = [line]
            else

              # join line to group
              lines_group << line
            end

          end

        ensure
          nif_file.close if nif_file && !nif_file.closed?
        end
      end
|
146
|
+
|
147
|
+
###
|
148
|
+
# The method helps to recollect relations by already generated result files.
|
149
|
+
#
|
150
|
+
# @param [Array<String>, String] entity_types Types from http://mappings.dbpedia.org/server/ontology/classes/
|
151
|
+
# @param [Fixnum] count Count of best ranked resources
|
152
|
+
def refresh_statistics_in_files(entity_types, count = 10)
|
153
|
+
resources = get_best_ranked_resources(entity_types, count)
|
154
|
+
|
155
|
+
resources = keep_loaded(resources)
|
156
|
+
|
157
|
+
resources.each { |resource_uri, resource_info|
|
158
|
+
puts "_____ #{resource_uri} _____" if @console_output
|
159
|
+
|
160
|
+
update_nif_file_properties(resource_uri, resource_info[:type]) { |link|
|
161
|
+
get_predicates_by_link(resource_uri, link, resource_info[:type])
|
162
|
+
}
|
163
|
+
}
|
164
|
+
|
165
|
+
end
|
166
|
+
|
167
|
+
|
168
|
+
###
|
169
|
+
# The method find predicates by given link.
|
170
|
+
# Find strict predicates that are in relation: <resource> ?predicate <link> .
|
171
|
+
# Find predicates that are in relation: ?subject a <type> . ?subject ?predicate <link>
|
172
|
+
#
|
173
|
+
# @param [String] resource_uri Resource for which will be find strict properties
|
174
|
+
# @param [String] link Link that has some importance to resource or entity type.
|
175
|
+
# @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
|
176
|
+
#
|
177
|
+
# @return [Hash] result
|
178
|
+
def get_predicates_by_link(resource_uri, link, type)
|
179
|
+
properties = {type => {}}
|
180
|
+
strict_properties = {type => {}}
|
181
|
+
|
182
|
+
@query.get_all_predicates_by_subject_object(resource_uri, link).each { |solution|
|
183
|
+
predicate = solution.to_h
|
184
|
+
property = predicate[:property].to_s.force_encoding('utf-8')
|
185
|
+
|
186
|
+
next if Predicate.unimportant?(property)
|
187
|
+
|
188
|
+
count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
|
189
|
+
strict_properties[type][property] = count if count > 0
|
190
|
+
}
|
191
|
+
|
192
|
+
@query.get_all_predicates_by_object(link).each { |solution|
|
193
|
+
predicate = solution.to_h
|
194
|
+
property = predicate[:property].to_s.force_encoding('utf-8')
|
195
|
+
|
196
|
+
next if Predicate.unimportant?(property) || strict_properties[type][property]
|
197
|
+
|
198
|
+
count = @query.get_count_predicate_by_entity(type, property)[0].to_h[:count].to_f
|
199
|
+
properties[type][property] = count if count > 0
|
200
|
+
}
|
201
|
+
|
202
|
+
|
203
|
+
{properties: properties, strict_properties: strict_properties}
|
204
|
+
end
|
205
|
+
|
206
|
+
###
|
207
|
+
# The method helps to store founded information from nif for given resource.
|
208
|
+
#
|
209
|
+
# @param [String] resource_uri
|
210
|
+
# @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
|
211
|
+
# @param [Hsah] result_relations Hash generated by method #find_relations
|
212
|
+
# @option result_relations [Hash] :sections Contains key 'section_type' value 'position'
|
213
|
+
# @option result_relations [Array<Hash>] :relations Hashes generated by method #get_predicates_by_link
|
214
|
+
#
|
215
|
+
# @param [Float] this_time Relative time of find in nif dataset.
|
216
|
+
def generate_result_file(resource_uri, type, result_relations, this_time)
|
217
|
+
section_degradation = result_relations[:sections].map { |section_type, position|
|
218
|
+
index = result_relations[:sections].keys.index(section_type)
|
219
|
+
|
220
|
+
# recognize value of degradation by relative position paragraphs in document
|
221
|
+
position[:degradation] = 1 - ((index / result_relations[:sections].size) / 10.0)
|
222
|
+
|
223
|
+
{section_type => position}
|
224
|
+
}.reduce(:merge)
|
225
|
+
|
226
|
+
total_size = section_degradation.max_by { |_, v| v[:to] }[1][:to].to_f
|
227
|
+
|
228
|
+
result_nif_data = result_relations[:relations].map { |relation|
|
229
|
+
paragraph_position = section_degradation[relation[:section]]
|
230
|
+
|
231
|
+
# weight is lowest by relative distance from document start
|
232
|
+
position_weight = (1 - ((relation[:indexes][0].to_i) / total_size))
|
233
|
+
# weight is also degraded by index of paragraph
|
234
|
+
relation[:weight] = (position_weight * paragraph_position[:degradation]).round(4)
|
235
|
+
|
236
|
+
relation
|
237
|
+
}
|
238
|
+
|
239
|
+
result = {
|
240
|
+
process_time: {nif_find: this_time, relations_find: result_relations[:time]},
|
241
|
+
resource_uri: resource_uri,
|
242
|
+
nif_data: result_nif_data
|
243
|
+
}
|
244
|
+
|
245
|
+
result_path = get_resource_file_path(resource_uri, type)
|
246
|
+
File.open(result_path, 'w:utf-8') { |f| f << JSON.pretty_generate(result) }
|
247
|
+
end
|
248
|
+
|
249
|
+
      ###
      # The method process all generated result files from nif dataset (by entity class type)
      # to one result knowledge base file.
      #
      # Scores every property by its occurrence count weighted with the paragraph
      # weight, normalizes per type, blends in the global literal statistics and
      # keeps only the best_count top-scored properties.
      #
      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
      # @param [Fixnum] best_count Define max count of properties that will be assign to entity class type.
      # @param [TrueClass, FalseClass] identify_identical Flag for process identify and group identical properties as one item.
      def generate_knowledge_base(type, best_count = 20, identify_identical = true)
        puts "_____ #{type} _____" if @console_output
        files = Dir.glob("#{@results_dir_path}/#{type}/*.json")
        type = type.to_s.to_sym

        knowledge_data = {type => []}

        files.each { |file_path|
          puts "- calculate #{file_path}" if @console_output
          file_data = JSON.parse(File.read(file_path).force_encoding('utf-8'), symbolize_names: true)

          if identify_identical
            # pre-register all property pairs so identical ones share one score entry
            # NOTE(review): `.uniq` binds only to the strict keys here — probably
            # the whole sum was meant to be uniq'd; harmless duplicates otherwise.
            file_data[:nif_data].each { |data|
              all_properties = data[:properties][type].keys + ((data[:strict_properties]||{})[type] || {}).keys.uniq
              @predicates_similarity.identify_identical_predicates(all_properties)
            }
          end

          file_data[:nif_data].each { |found|

            properties = found[:properties][type.to_sym]
            strict_properties = (found[:strict_properties] ||{})[type] || {}
            weight = found[:weight]

            # strict properties: predicates linking the resource itself to the found link
            strict_properties.each { |property, count|
              property = property.to_s
              value = count.to_i * weight

              # running average: unpack the previous mean, include value, re-average
              prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
                old_score = from_knowledge[:score] * from_knowledge[:counter]
                from_knowledge[:counter] += 1
                (old_score + value) / from_knowledge[:counter]
              }
            }

            # broad properties: same running-average accumulation
            properties.each { |property, count|
              property = property.to_s
              value = count.to_i * weight

              prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
                old_score = from_knowledge[:score] * from_knowledge[:counter]
                from_knowledge[:counter] += 1
                (old_score + value) / from_knowledge[:counter]
              }
            }
          }

          # normalize scores to <0, 1> relative to the current maximum
          unless knowledge_data[type].empty?
            max_weight = knowledge_data[type].max_by { |data| data[:score] }[:score]
            knowledge_data[type] = knowledge_data[type].map { |hash|
              hash[:score] = (hash[:score] / max_weight).round(4)
              hash
            }
          end
        }

        # NOTE(review): get_global_statistic_by_type is defined outside this view;
        # it appears to load the per-type literal statistics — confirm.
        global_properties = get_global_statistic_by_type(type) || {}
        if identify_identical
          @predicates_similarity.identify_identical_predicates(global_properties.keys)
        end

        if global_properties.size > 0
          max_count = global_properties.max_by { |_, count| count }[1].to_f
          global_properties.each { |property, count|

            value = count / max_count

            # blend: average the nif-based score with the normalized global occurrence
            prepare_property_to_knowledge(property, knowledge_data[type]) { |from_knowledge|
              from_knowledge[:score] > 0 ? ((from_knowledge[:score] + value) / 2.0).round(4) : value.round(4)
            }
          }
        end

        # :counter is internal bookkeeping only — drop it before persisting
        knowledge_data[type].map! { |hash|
          hash.delete(:counter)
          hash
        }

        knowledge_data[type] = knowledge_data[type].sort_by { |hash| hash[:score] }.reverse.take(best_count)

        if identify_identical
          @predicates_similarity.reduce_identical
        end

        update_knowledge_base(knowledge_data)
      end
|
342
|
+
|
343
|
+
###
|
344
|
+
# The method generate simple statistics that contain all predicates that links to literal.
|
345
|
+
# Predicates are grouped by entity class type and also contains count of total occurrence.
|
346
|
+
# Predicates find from best ranked resources.
|
347
|
+
#
|
348
|
+
# @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
|
349
|
+
# @param [Fixnum] count Count of best ranked resources
|
350
|
+
def generate_literal_statistics(type = nil, count = 10)
|
351
|
+
unless type
|
352
|
+
type = get_all_classes
|
353
|
+
end
|
354
|
+
|
355
|
+
type = [type] unless type.is_a?(Array)
|
356
|
+
|
357
|
+
type.each_with_index { |entity_type, index|
|
358
|
+
all_properties = {}
|
359
|
+
puts "#{__method__} - start process entity type: #{entity_type} [#{(index / type.size.to_f).round(2)}]" if @console_output
|
360
|
+
entity_type = entity_type.to_s.to_sym
|
361
|
+
|
362
|
+
get_best_ranked_resources(entity_type, count).each { |resource, _|
|
363
|
+
properties = @query.get_all_predicates_by_subject(resource.to_s, true).map { |solution_prop|
|
364
|
+
solution_prop[:property].to_s
|
365
|
+
} || []
|
366
|
+
|
367
|
+
properties.uniq.each { |prop|
|
368
|
+
next if Predicate.unimportant?(prop)
|
369
|
+
all_properties[entity_type] ||= {}
|
370
|
+
all_properties[entity_type][prop] ||= 0
|
371
|
+
all_properties[entity_type][prop] += 1
|
372
|
+
}
|
373
|
+
|
374
|
+
}
|
375
|
+
|
376
|
+
update_global_statistic(all_properties)
|
377
|
+
}
|
378
|
+
end
|
379
|
+
|
380
|
+
###
|
381
|
+
# The method load all defined entity class types by http://mappings.dbpedia.org/server/ontology/classes/
|
382
|
+
#
|
383
|
+
# @param [String] path
|
384
|
+
#
|
385
|
+
# @return [Hash] classes
|
386
|
+
def get_all_classes(path = File.join(__dir__,'../knowledge/classes_hierarchy.json'))
|
387
|
+
data = ensure_load_json(path, {})
|
388
|
+
HashHelper.recursive_map_keys(data)
|
389
|
+
end
|
390
|
+
|
391
|
+
|
392
|
+
private
|
393
|
+
|
394
|
+
      ###
      # The method helps to continue of process find links in nif dataset.
      # Collects the section boundaries of every found link and enriches each
      # record with the predicates obtained via #get_predicates_by_link.
      #
      # @param [String] resource_uri
      # @param [Hash] actual_resource_data Part data extracted from nif dataset for given resource_uri
      # @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
      #
      # @return [Hash] out With keys :sections, :relations and :time (seconds spent).
      def find_relations(resource_uri, actual_resource_data, type)
        out = {
          sections: {},
          relations: []
        }

        puts "- properties to find size[#{actual_resource_data.size}]" if @console_output

        time = Benchmark.realtime {
          # NOTE(review): map! mutates actual_resource_data in place — the caller's
          # array is enriched as a side effect.
          out[:relations] = actual_resource_data.map! { |resource_data|
            # e.g. "paragraph_0_419" -> type / from / to capture groups
            section_group = resource_data[:section].scan(SCAN_REGEXP[:group])

            type_key = resource_data[:section].force_encoding('utf-8')

            out[:sections][type_key] ||= {
              type: section_group[0][0],
              from: section_group[0][1].to_i,
              to: section_group[0][2].to_i,
            }

            result = get_predicates_by_link(resource_uri, resource_data[:link], type)

            resource_data[:properties] = result[:properties]
            resource_data[:strict_properties] = result[:strict_properties]

            resource_data
          }.compact || []
        }

        out[:time] = time.round(2)

        puts "- properties found in #{out[:time]}" if @console_output

        out
      end
|
437
|
+
|
438
|
+
|
439
|
+
###
# Builds the JSON result file path for the given resource URI.
# Also ensures the per-type sub directory exists under the results directory.
#
# @param [String] resource_uri
# @param [String] type
#
# @return [String] resource_file_path
def get_resource_file_path(resource_uri, type)
  type_dir = "#{@results_dir_path}/#{type.split('/').last}"
  Dir.mkdir(type_dir) unless Dir.exist?(type_dir)

  file_name = StringHelper.get_clear_file_path(resource_uri.split('/').last)
  "#{type_dir}/#{file_name}.json"
end
|
456
|
+
|
457
|
+
###
# The method helps update found predicates to stored links.
# Loads the stored resource JSON, yields each stored link (as a Symbol) to the
# caller's block and merges the returned :properties / :strict_properties back
# into the stored entries, then rewrites the file. Does nothing unless a block
# is given (returns nil in that case).
#
# NOTE(review): `old_data.dup` is a shallow copy — the hashes inside
# old_data[:nif_data] are mutated by the map block, so after the call
# old_data and new_data share the updated entries. Confirm callers expect that.
# NOTE(review): raises NoMethodError if the stored JSON lacks :nif_data or
# :process_time keys — presumably guaranteed by the writer of these files.
#
# @param [String] resource_uri
# @param [String] type
#
# @yieldparam [Symbol] actual_link stored link value
# @yieldreturn [Hash] result with optional :properties / :strict_properties
#
# @return [Array<Hash>] old_data, new_data
def update_nif_file_properties(resource_uri, type)
  if block_given?
    path = get_resource_file_path(resource_uri, type)
    old_data = ensure_load_json(path, {}, symbolize_names: true)

    new_data = old_data.dup

    time = Benchmark.realtime {
      new_data[:nif_data] = old_data[:nif_data].map { |hash|
        actual_link = hash[:link].to_sym

        result = yield actual_link

        # Only overwrite stored values when the block actually returned them.
        hash[:strict_properties] = result[:strict_properties] if result[:strict_properties]
        hash[:properties] = result[:properties] if result[:properties]

        hash
      }
    }

    new_data[:process_time][:relations_find] = time.round(2)

    File.write(path, JSON.pretty_generate(new_data))
    return old_data, new_data
  end
end
|
490
|
+
|
491
|
+
###
# Looks up the knowledge entry whose :predicates contain the given property.
# When no entry exists yet, a fresh one (counter 0, score 0.0, identical
# predicates resolved via @predicates_similarity) is appended to
# +this_knowledge_data+. The entry is then yielded to the block and its
# :score is replaced by the block's return value.
#
# @param [String] property
# @param [Array<Hash>] this_knowledge_data
#
# @yieldparam [Hash] found entry with :counter, :score and :predicates
# @yieldreturn [Float] new score stored on the entry
def prepare_property_to_knowledge(property, this_knowledge_data)
  property_key = property.to_s
  knowledge = this_knowledge_data || []

  entry = knowledge.detect { |item| item[:predicates].include?(property_key) }

  if entry.nil? || entry.empty?
    # Unknown property: seed a fresh entry with its identical predicates.
    entry = {
      counter: 0,
      score: 0.0,
      predicates: @predicates_similarity.find_identical(property_key) || [property_key.to_s]
    }
    knowledge << entry
  end

  entry[:score] = yield(entry)
end
|
526
|
+
|
527
|
+
###
# The method deletes all resources that already have a created result file,
# leaving only the resources that still need processing.
#
# @param [Hash{resource=>type}] resources
#
# @return [Hash] resources with already-processed entries removed (mutated in place)
def keep_unloaded(resources)
  resources.delete_if { |resource, values|
    dir_path = "#{@results_dir_path}/#{values[:type]}"
    resource_name = resource.split('/').last
    # File.exists? is deprecated and removed in Ruby 3.2 — use File.exist?.
    File.exist?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
  }
end
|
538
|
+
|
539
|
+
###
# The method keeps only the resources that already have a created result file.
#
# @param [Hash{resource=>type}] resources
#
# @return [Hash] resources reduced to already-processed entries (mutated in place)
def keep_loaded(resources)
  resources.keep_if { |resource, values|
    dir_path = "#{@results_dir_path}/#{values[:type]}"
    resource_name = resource.split('/').last
    # File.exists? is deprecated and removed in Ruby 3.2 — use File.exist?.
    File.exist?("#{dir_path}/#{StringHelper.get_clear_file_path(resource_name)}.json")
  }
end
|
550
|
+
|
551
|
+
###
# The method allows updating the knowledge base file per entity class type.
# Performs a shallow merge: top-level keys from +new_data+ overwrite stored ones.
#
# @param [Hash] new_data
def update_knowledge_base(new_data)
  file_path = "#{@results_dir_path}/knowledge_base.json"
  stored = ensure_load_json(file_path, {}, symbolize_names: true)
  merged = stored.merge(new_data)
  File.write(file_path, JSON.pretty_generate(merged))
end
|
560
|
+
|
561
|
+
###
# The method allows updating the global statistic file per entity class type.
# Performs a shallow merge: top-level keys from +new_data+ overwrite stored ones.
#
# @param [Hash] new_data
def update_global_statistic(new_data)
  file_path = "#{@results_dir_path}/global_statistic.json"
  stored = ensure_load_json(file_path, {}, symbolize_names: true)
  merged = stored.merge(new_data)
  File.write(file_path, JSON.pretty_generate(merged))
end
|
570
|
+
|
571
|
+
###
# The method returns the stored global properties for a given entity class type.
# Returns nil when the type has no stored statistics.
#
# @param [String] type Type from http://mappings.dbpedia.org/server/ontology/classes/
#
# @return [Hash] global_statistic_by_type
def get_global_statistic_by_type(type)
  stats_path = "#{@results_dir_path}/global_statistic.json"
  all_stats = ensure_load_json(stats_path, {}, symbolize_names: true)
  all_stats[type.to_s.to_sym]
end
|
583
|
+
|
584
|
+
###
# The method helps to load a JSON file safely.
# Returns +def_val+ when the file does not exist or its content is shorter
# than the smallest bracketed JSON document ('[]' / '{}', i.e. under 2 chars).
#
# NOTE: a 1-character file containing a valid JSON scalar (e.g. "5") also
# falls back to +def_val+ — preserved from the original size check.
#
# @param [String] file_path
# @param [Object] def_val Returned when the file is missing or (nearly) empty.
# @param [Hash] json_params JSON.parse params (e.g. symbolize_names: true)
#
# @return [Object] json
def ensure_load_json(file_path, def_val, json_params = {})
  # File.exists? is deprecated and removed in Ruby 3.2 — use File.exist?.
  return def_val unless File.exist?(file_path)

  file_data = File.read(file_path).force_encoding('utf-8')
  if file_data.size >= 2 # '[]'
    JSON.parse(file_data, json_params)
  else
    def_val
  end
end
|
604
|
+
|
605
|
+
|
606
|
+
end
|
607
|
+
end
|
608
|
+
|
609
|
+
end
|