rdf-normalize 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/rdf/normalize/base.rb +8 -1
- data/lib/rdf/normalize/carroll2001.rb +1 -3
- data/lib/rdf/normalize/rdfc10.rb +64 -25
- data/lib/rdf/normalize/urgna2012.rb +7 -1
- data/lib/rdf/normalize.rb +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7b8d7e930eb7f452fef42bd6f66b29dfcd3f526f7c9dafebbfe214ed5dfa4007
|
4
|
+
data.tar.gz: c4c60292f9868d39d50545dca77c8d5afd85885b0a1fc690b0ce84d2ee00ddd7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05dd3390670479211a348c4fbd91e0c369355f374d72d2834697641ab64b3cce438b4df8ee63f6db50af1d68e036fa747762474448e6d326d5bfa758ca120b3d
|
7
|
+
data.tar.gz: 6189376e59d897e1e6d4b440b6ed75f0e7de970d6cbacb6b21e7e75976e7da0b6e6899085e2972eabf307556d9b17eb5277509fde3096e3ab428a0e84758d9c8
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.6.0
|
1
|
+
0.6.1
|
data/lib/rdf/normalize/base.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module RDF::Normalize
|
2
2
|
##
|
3
3
|
# Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
|
4
|
-
|
4
|
+
class Base
|
5
5
|
attr_reader :dataset
|
6
6
|
|
7
7
|
# Enumerates normalized statements
|
@@ -11,5 +11,12 @@ module RDF::Normalize
|
|
11
11
|
def each(&block)
|
12
12
|
raise "Not Implemented"
|
13
13
|
end
|
14
|
+
|
15
|
+
# Returns a map from input blank node identifiers to canonical blank node identifiers.
|
16
|
+
#
|
17
|
+
# @return [Hash{String => String}]
|
18
|
+
def to_hash
|
19
|
+
raise "Not Implemented"
|
20
|
+
end
|
14
21
|
end
|
15
22
|
end
|
data/lib/rdf/normalize/rdfc10.rb
CHANGED
@@ -6,27 +6,43 @@ rescue LoadError
|
|
6
6
|
end
|
7
7
|
|
8
8
|
module RDF::Normalize
|
9
|
-
class RDFC10
|
9
|
+
class RDFC10 < Base
|
10
10
|
include RDF::Enumerable
|
11
11
|
include RDF::Util::Logger
|
12
|
-
include Base
|
13
12
|
|
14
13
|
##
|
15
14
|
# Create an enumerable with grounded nodes
|
16
15
|
#
|
17
16
|
# @param [RDF::Enumerable] enumerable
|
17
|
+
# @option options [Integer] :max_calls (40)
|
18
|
+
# Maximum number of calls allowed for recursive blank node labeling,
|
19
|
+
# as a multiple of the total number of blank nodes in the dataset.
|
18
20
|
# @return [RDF::Enumerable]
|
21
|
+
# @raise [RuntimeError] if the maximum number of calls allowed for recursive blank node labeling is exceeded.
|
19
22
|
def initialize(enumerable, **options)
|
20
23
|
@dataset, @options = enumerable, options
|
21
24
|
end
|
22
25
|
|
26
|
+
# Yields each normalized statement
|
23
27
|
def each(&block)
|
24
|
-
ns = NormalizationState.new(
|
28
|
+
ns = NormalizationState.new(**@options)
|
25
29
|
log_debug("ca:")
|
26
30
|
log_debug(" log point", "Entering the canonicalization function (4.5.3).")
|
27
31
|
log_depth(depth: 2) {normalize_statements(ns, &block)}
|
28
32
|
end
|
29
33
|
|
34
|
+
# Returns a map from input blank node identifiers to canonical blank node identifiers.
|
35
|
+
#
|
36
|
+
# @return [Hash{String => String}]
|
37
|
+
def to_hash
|
38
|
+
ns = NormalizationState.new(**@options)
|
39
|
+
log_debug("ca:")
|
40
|
+
log_debug(" log point", "Entering the canonicalization function (4.5.3).")
|
41
|
+
log_depth(depth: 2) {normalize_statements(ns)}
|
42
|
+
ns.canonical_issuer.to_hash
|
43
|
+
end
|
44
|
+
|
45
|
+
#
|
30
46
|
protected
|
31
47
|
def normalize_statements(ns, &block)
|
32
48
|
# Step 2: Map BNodes to the statements they are used by
|
@@ -79,6 +95,11 @@ module RDF::Normalize
|
|
79
95
|
log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
|
80
96
|
log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
|
81
97
|
log_debug(" with:") unless ns.hash_to_bnodes.empty?
|
98
|
+
|
99
|
+
# Initialize the number of calls allowed to hash_n_degree_quads
|
100
|
+
# as a multiple of the total number of blank nodes in the dataset.
|
101
|
+
ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
|
102
|
+
|
82
103
|
ns.hash_to_bnodes.keys.sort.each do |hash|
|
83
104
|
identifier_list = ns.hash_to_bnodes[hash]
|
84
105
|
|
@@ -105,27 +126,29 @@ module RDF::Normalize
|
|
105
126
|
hash_path_list.sort_by(&:first).each do |result, issuer|
|
106
127
|
issuer.issued.each do |node|
|
107
128
|
id = ns.canonical_issuer.issue_identifier(node)
|
108
|
-
log_debug("
|
109
|
-
log_debug("
|
129
|
+
log_debug(" - blank node") {node.id}
|
130
|
+
log_debug(" canonical identifier", id)
|
110
131
|
end
|
111
132
|
end
|
112
133
|
end
|
113
134
|
|
114
135
|
# Step 6: Yield statements using BNodes from canonical replacements
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
136
|
+
if block_given?
|
137
|
+
dataset.each_statement do |statement|
|
138
|
+
if statement.has_blank_nodes?
|
139
|
+
quad = statement.to_quad.compact.map do |term|
|
140
|
+
term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
|
141
|
+
end
|
142
|
+
block.call RDF::Statement.from(quad)
|
143
|
+
else
|
144
|
+
block.call statement
|
119
145
|
end
|
120
|
-
block.call RDF::Statement.from(quad)
|
121
|
-
else
|
122
|
-
block.call statement
|
123
146
|
end
|
124
147
|
end
|
125
148
|
|
126
149
|
log_debug("ca.6:")
|
127
|
-
log_debug(" log point", "
|
128
|
-
log_debug("
|
150
|
+
log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
|
151
|
+
log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
|
129
152
|
dataset
|
130
153
|
end
|
131
154
|
|
@@ -137,10 +160,13 @@ module RDF::Normalize
|
|
137
160
|
attr_accessor :bnode_to_statements
|
138
161
|
attr_accessor :hash_to_bnodes
|
139
162
|
attr_accessor :canonical_issuer
|
163
|
+
attr_accessor :max_calls
|
164
|
+
attr_accessor :total_calls
|
140
165
|
|
141
|
-
def initialize(options)
|
166
|
+
def initialize(**options)
|
142
167
|
@options = options
|
143
168
|
@bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
|
169
|
+
@max_calls, @total_calls = nil, 0
|
144
170
|
end
|
145
171
|
|
146
172
|
def add_statement(node, statement)
|
@@ -204,34 +230,40 @@ module RDF::Normalize
|
|
204
230
|
hexdigest(input)
|
205
231
|
end
|
206
232
|
|
207
|
-
# @param [RDF::Node]
|
233
|
+
# @param [RDF::Node] node
|
208
234
|
# @param [IdentifierIssuer] issuer
|
209
235
|
# @return [Array<String,IdentifierIssuer>] the Hash and issuer
|
210
|
-
|
236
|
+
# @raise [RuntimeError] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
|
237
|
+
def hash_n_degree_quads(node, issuer)
|
211
238
|
log_debug("hndq:")
|
212
239
|
log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
|
213
|
-
log_debug(" identifier") {
|
240
|
+
log_debug(" identifier") {node.id}
|
214
241
|
log_debug(" issuer") {issuer.inspect}
|
215
242
|
|
243
|
+
if max_calls && total_calls >= max_calls
|
244
|
+
raise "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
|
245
|
+
end
|
246
|
+
@total_calls += 1
|
247
|
+
|
216
248
|
# hash to related blank nodes map
|
217
249
|
hn = {}
|
218
250
|
|
219
251
|
log_debug(" hndq.2:")
|
220
252
|
log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
|
221
253
|
log_debug(" quads:")
|
222
|
-
bnode_to_statements[
|
254
|
+
bnode_to_statements[node].each do |s|
|
223
255
|
log_debug {" - #{s.to_nquads.strip}"}
|
224
256
|
end
|
225
257
|
|
226
258
|
# Step 3
|
227
259
|
log_debug(" hndq.3:")
|
228
260
|
log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
|
229
|
-
log_debug(" with:") unless bnode_to_statements[
|
230
|
-
bnode_to_statements[
|
261
|
+
log_debug(" with:") unless bnode_to_statements[node].empty?
|
262
|
+
bnode_to_statements[node].each do |statement|
|
231
263
|
log_debug {" - quad: #{statement.to_nquads.strip}"}
|
232
264
|
log_debug(" hndq.3.1:")
|
233
265
|
log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
|
234
|
-
log_depth(depth: 10) {hash_related_statement(
|
266
|
+
log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
|
235
267
|
end
|
236
268
|
log_debug(" Hash to bnodes:")
|
237
269
|
hn.each do |k,v|
|
@@ -286,7 +318,9 @@ module RDF::Normalize
|
|
286
318
|
log_debug(" with:") unless recursion_list.empty?
|
287
319
|
recursion_list.each do |related|
|
288
320
|
log_debug(" - related") {related.id}
|
289
|
-
result = log_depth(depth: 18)
|
321
|
+
result = log_depth(depth: 18) do
|
322
|
+
hash_n_degree_quads(related, issuer_copy)
|
323
|
+
end
|
290
324
|
path << '_:' + issuer_copy.issue_identifier(related)
|
291
325
|
path << "<#{result.first}>"
|
292
326
|
issuer_copy = result.last
|
@@ -337,10 +371,10 @@ module RDF::Normalize
|
|
337
371
|
end
|
338
372
|
|
339
373
|
# Group adjacent bnodes by hash
|
340
|
-
def hash_related_statement(
|
374
|
+
def hash_related_statement(node, statement, issuer, map)
|
341
375
|
log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
|
342
376
|
statement.to_h(:s, :p, :o, :g).each do |pos, term|
|
343
|
-
next if !term.is_a?(RDF::Node) || term ==
|
377
|
+
next if !term.is_a?(RDF::Node) || term == node
|
344
378
|
|
345
379
|
log_debug(" - position", pos)
|
346
380
|
hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
|
@@ -374,6 +408,11 @@ module RDF::Normalize
|
|
374
408
|
@issued[node]
|
375
409
|
end
|
376
410
|
|
411
|
+
# @return [Hash{String => String}] the issued identifiers map
|
412
|
+
def to_hash
|
413
|
+
@issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
|
414
|
+
end
|
415
|
+
|
377
416
|
# Duplicate this issuer, ensuring that the issued identifiers remain distinct
|
378
417
|
# @return [IdentifierIssuer]
|
379
418
|
def dup
|
@@ -2,10 +2,16 @@ module RDF::Normalize
|
|
2
2
|
class URGNA2012 < RDFC10
|
3
3
|
|
4
4
|
def each(&block)
|
5
|
-
ns = NormalizationState.new(
|
5
|
+
ns = NormalizationState.new(**@options)
|
6
6
|
normalize_statements(ns, &block)
|
7
7
|
end
|
8
8
|
|
9
|
+
def to_hash
|
10
|
+
ns = NormalizationState.new(**@options)
|
11
|
+
normalize_statements(ns)
|
12
|
+
ns.canonical_issuer.to_h
|
13
|
+
end
|
14
|
+
|
9
15
|
class NormalizationState < RDFC10::NormalizationState
|
10
16
|
protected
|
11
17
|
|
data/lib/rdf/normalize.rb
CHANGED
@@ -52,6 +52,10 @@ module RDF
|
|
52
52
|
# @param [Hash{Symbol => Object}] options
|
53
53
|
# @option options [Base] :algorithm (:rdfc10)
|
54
54
|
# One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
|
55
|
+
# @option options [Integer] :max_calls
|
56
|
+
# Maximum number of calls allowed for recursive blank node labeling,
|
57
|
+
# as a multiple of the total number of blank nodes in the dataset.
|
58
|
+
# @option options [Boolean] :identifier_map
|
55
59
|
# @return [RDF::Normalize::Base]
|
56
60
|
# @raise [ArgumentError] selected algorithm not defined
|
57
61
|
def new(enumerable, **options)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdf-normalize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregg Kellogg
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdf
|