rdf-normalize 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/rdf/normalize/base.rb +8 -1
- data/lib/rdf/normalize/carroll2001.rb +1 -3
- data/lib/rdf/normalize/rdfc10.rb +64 -25
- data/lib/rdf/normalize/urgna2012.rb +7 -1
- data/lib/rdf/normalize.rb +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7b8d7e930eb7f452fef42bd6f66b29dfcd3f526f7c9dafebbfe214ed5dfa4007
|
4
|
+
data.tar.gz: c4c60292f9868d39d50545dca77c8d5afd85885b0a1fc690b0ce84d2ee00ddd7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05dd3390670479211a348c4fbd91e0c369355f374d72d2834697641ab64b3cce438b4df8ee63f6db50af1d68e036fa747762474448e6d326d5bfa758ca120b3d
|
7
|
+
data.tar.gz: 6189376e59d897e1e6d4b440b6ed75f0e7de970d6cbacb6b21e7e75976e7da0b6e6899085e2972eabf307556d9b17eb5277509fde3096e3ab428a0e84758d9c8
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.6.
|
1
|
+
0.6.1
|
data/lib/rdf/normalize/base.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module RDF::Normalize
|
2
2
|
##
|
3
3
|
# Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
|
4
|
-
|
4
|
+
class Base
|
5
5
|
attr_reader :dataset
|
6
6
|
|
7
7
|
# Enumerates normalized statements
|
@@ -11,5 +11,12 @@ module RDF::Normalize
|
|
11
11
|
def each(&block)
|
12
12
|
raise "Not Implemented"
|
13
13
|
end
|
14
|
+
|
15
|
+
# Returns a map from input blank node identifiers to canonical blank node identifiers.
|
16
|
+
#
|
17
|
+
# @return [Hash{String => String}]
|
18
|
+
def to_hash
|
19
|
+
raise "Not Implemented"
|
20
|
+
end
|
14
21
|
end
|
15
22
|
end
|
data/lib/rdf/normalize/rdfc10.rb
CHANGED
@@ -6,27 +6,43 @@ rescue LoadError
|
|
6
6
|
end
|
7
7
|
|
8
8
|
module RDF::Normalize
|
9
|
-
class RDFC10
|
9
|
+
class RDFC10 < Base
|
10
10
|
include RDF::Enumerable
|
11
11
|
include RDF::Util::Logger
|
12
|
-
include Base
|
13
12
|
|
14
13
|
##
|
15
14
|
# Create an enumerable with grounded nodes
|
16
15
|
#
|
17
16
|
# @param [RDF::Enumerable] enumerable
|
17
|
+
# @option options [Integer] :max_calls (40)
|
18
|
+
# Maximum number of calls allowed for recursive blank node labeling,
|
19
|
+
# as a multiple of the total number of blank nodes in the dataset.
|
18
20
|
# @return [RDF::Enumerable]
|
21
|
+
# @raise [RuntimeError] if the maximum number of calls allowed for recursive blank node labeling is exceeded.
|
19
22
|
def initialize(enumerable, **options)
|
20
23
|
@dataset, @options = enumerable, options
|
21
24
|
end
|
22
25
|
|
26
|
+
# Yields each normalized statement
|
23
27
|
def each(&block)
|
24
|
-
ns = NormalizationState.new(
|
28
|
+
ns = NormalizationState.new(**@options)
|
25
29
|
log_debug("ca:")
|
26
30
|
log_debug(" log point", "Entering the canonicalization function (4.5.3).")
|
27
31
|
log_depth(depth: 2) {normalize_statements(ns, &block)}
|
28
32
|
end
|
29
33
|
|
34
|
+
# Returns a map from input blank node identifiers to canonical blank node identifiers.
|
35
|
+
#
|
36
|
+
# @return [Hash{String => String}]
|
37
|
+
def to_hash
|
38
|
+
ns = NormalizationState.new(**@options)
|
39
|
+
log_debug("ca:")
|
40
|
+
log_debug(" log point", "Entering the canonicalization function (4.5.3).")
|
41
|
+
log_depth(depth: 2) {normalize_statements(ns)}
|
42
|
+
ns.canonical_issuer.to_hash
|
43
|
+
end
|
44
|
+
|
45
|
+
#
|
30
46
|
protected
|
31
47
|
def normalize_statements(ns, &block)
|
32
48
|
# Step 2: Map BNodes to the statements they are used by
|
@@ -79,6 +95,11 @@ module RDF::Normalize
|
|
79
95
|
log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
|
80
96
|
log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
|
81
97
|
log_debug(" with:") unless ns.hash_to_bnodes.empty?
|
98
|
+
|
99
|
+
# Initialize the number of calls allowed to hash_n_degree_quads
|
100
|
+
# as a multiple of the total number of blank nodes in the dataset.
|
101
|
+
ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
|
102
|
+
|
82
103
|
ns.hash_to_bnodes.keys.sort.each do |hash|
|
83
104
|
identifier_list = ns.hash_to_bnodes[hash]
|
84
105
|
|
@@ -105,27 +126,29 @@ module RDF::Normalize
|
|
105
126
|
hash_path_list.sort_by(&:first).each do |result, issuer|
|
106
127
|
issuer.issued.each do |node|
|
107
128
|
id = ns.canonical_issuer.issue_identifier(node)
|
108
|
-
log_debug("
|
109
|
-
log_debug("
|
129
|
+
log_debug(" - blank node") {node.id}
|
130
|
+
log_debug(" canonical identifier", id)
|
110
131
|
end
|
111
132
|
end
|
112
133
|
end
|
113
134
|
|
114
135
|
# Step 6: Yield statements using BNodes from canonical replacements
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
136
|
+
if block_given?
|
137
|
+
dataset.each_statement do |statement|
|
138
|
+
if statement.has_blank_nodes?
|
139
|
+
quad = statement.to_quad.compact.map do |term|
|
140
|
+
term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
|
141
|
+
end
|
142
|
+
block.call RDF::Statement.from(quad)
|
143
|
+
else
|
144
|
+
block.call statement
|
119
145
|
end
|
120
|
-
block.call RDF::Statement.from(quad)
|
121
|
-
else
|
122
|
-
block.call statement
|
123
146
|
end
|
124
147
|
end
|
125
148
|
|
126
149
|
log_debug("ca.6:")
|
127
|
-
log_debug(" log point", "
|
128
|
-
log_debug("
|
150
|
+
log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
|
151
|
+
log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
|
129
152
|
dataset
|
130
153
|
end
|
131
154
|
|
@@ -137,10 +160,13 @@ module RDF::Normalize
|
|
137
160
|
attr_accessor :bnode_to_statements
|
138
161
|
attr_accessor :hash_to_bnodes
|
139
162
|
attr_accessor :canonical_issuer
|
163
|
+
attr_accessor :max_calls
|
164
|
+
attr_accessor :total_calls
|
140
165
|
|
141
|
-
def initialize(options)
|
166
|
+
def initialize(**options)
|
142
167
|
@options = options
|
143
168
|
@bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
|
169
|
+
@max_calls, @total_calls = nil, 0
|
144
170
|
end
|
145
171
|
|
146
172
|
def add_statement(node, statement)
|
@@ -204,34 +230,40 @@ module RDF::Normalize
|
|
204
230
|
hexdigest(input)
|
205
231
|
end
|
206
232
|
|
207
|
-
# @param [RDF::Node]
|
233
|
+
# @param [RDF::Node] node
|
208
234
|
# @param [IdentifierIssuer] issuer
|
209
235
|
# @return [Array<String,IdentifierIssuer>] the Hash and issuer
|
210
|
-
|
236
|
+
# @raise [RuntimeError] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
|
237
|
+
def hash_n_degree_quads(node, issuer)
|
211
238
|
log_debug("hndq:")
|
212
239
|
log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
|
213
|
-
log_debug(" identifier") {
|
240
|
+
log_debug(" identifier") {node.id}
|
214
241
|
log_debug(" issuer") {issuer.inspect}
|
215
242
|
|
243
|
+
if max_calls && total_calls >= max_calls
|
244
|
+
raise "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
|
245
|
+
end
|
246
|
+
@total_calls += 1
|
247
|
+
|
216
248
|
# hash to related blank nodes map
|
217
249
|
hn = {}
|
218
250
|
|
219
251
|
log_debug(" hndq.2:")
|
220
252
|
log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
|
221
253
|
log_debug(" quads:")
|
222
|
-
bnode_to_statements[
|
254
|
+
bnode_to_statements[node].each do |s|
|
223
255
|
log_debug {" - #{s.to_nquads.strip}"}
|
224
256
|
end
|
225
257
|
|
226
258
|
# Step 3
|
227
259
|
log_debug(" hndq.3:")
|
228
260
|
log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
|
229
|
-
log_debug(" with:") unless bnode_to_statements[
|
230
|
-
bnode_to_statements[
|
261
|
+
log_debug(" with:") unless bnode_to_statements[node].empty?
|
262
|
+
bnode_to_statements[node].each do |statement|
|
231
263
|
log_debug {" - quad: #{statement.to_nquads.strip}"}
|
232
264
|
log_debug(" hndq.3.1:")
|
233
265
|
log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
|
234
|
-
log_depth(depth: 10) {hash_related_statement(
|
266
|
+
log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
|
235
267
|
end
|
236
268
|
log_debug(" Hash to bnodes:")
|
237
269
|
hn.each do |k,v|
|
@@ -286,7 +318,9 @@ module RDF::Normalize
|
|
286
318
|
log_debug(" with:") unless recursion_list.empty?
|
287
319
|
recursion_list.each do |related|
|
288
320
|
log_debug(" - related") {related.id}
|
289
|
-
result = log_depth(depth: 18)
|
321
|
+
result = log_depth(depth: 18) do
|
322
|
+
hash_n_degree_quads(related, issuer_copy)
|
323
|
+
end
|
290
324
|
path << '_:' + issuer_copy.issue_identifier(related)
|
291
325
|
path << "<#{result.first}>"
|
292
326
|
issuer_copy = result.last
|
@@ -337,10 +371,10 @@ module RDF::Normalize
|
|
337
371
|
end
|
338
372
|
|
339
373
|
# Group adjacent bnodes by hash
|
340
|
-
def hash_related_statement(
|
374
|
+
def hash_related_statement(node, statement, issuer, map)
|
341
375
|
log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
|
342
376
|
statement.to_h(:s, :p, :o, :g).each do |pos, term|
|
343
|
-
next if !term.is_a?(RDF::Node) || term ==
|
377
|
+
next if !term.is_a?(RDF::Node) || term == node
|
344
378
|
|
345
379
|
log_debug(" - position", pos)
|
346
380
|
hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
|
@@ -374,6 +408,11 @@ module RDF::Normalize
|
|
374
408
|
@issued[node]
|
375
409
|
end
|
376
410
|
|
411
|
+
# @return [Hash{String => String}] the issued identifiers map
|
412
|
+
def to_hash
|
413
|
+
@issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
|
414
|
+
end
|
415
|
+
|
377
416
|
# Duplicate this issuer, ensuring that the issued identifiers remain distinct
|
378
417
|
# @return [IdentifierIssuer]
|
379
418
|
def dup
|
@@ -2,10 +2,16 @@ module RDF::Normalize
|
|
2
2
|
class URGNA2012 < RDFC10
|
3
3
|
|
4
4
|
def each(&block)
|
5
|
-
ns = NormalizationState.new(
|
5
|
+
ns = NormalizationState.new(**@options)
|
6
6
|
normalize_statements(ns, &block)
|
7
7
|
end
|
8
8
|
|
9
|
+
def to_hash
|
10
|
+
ns = NormalizationState.new(**@options)
|
11
|
+
normalize_statements(ns)
|
12
|
+
ns.canonical_issuer.to_h
|
13
|
+
end
|
14
|
+
|
9
15
|
class NormalizationState < RDFC10::NormalizationState
|
10
16
|
protected
|
11
17
|
|
data/lib/rdf/normalize.rb
CHANGED
@@ -52,6 +52,10 @@ module RDF
|
|
52
52
|
# @param [Hash{Symbol => Object}] options
|
53
53
|
# @option options [Base] :algorithm (:rdfc10)
|
54
54
|
# One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
|
55
|
+
# @option options [Integer] :max_calls
|
56
|
+
# Maximum number of calls allowed for recursive blank node labeling,
|
57
|
+
# as a multiple of the total number of blank nodes in the dataset.
|
58
|
+
# @option options [Boolean] :identifier_map
|
55
59
|
# @return [RDF::Normalize::Base]
|
56
60
|
# @raise [ArgumentError] selected algorithm not defined
|
57
61
|
def new(enumerable, **options)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdf-normalize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregg Kellogg
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdf
|