rdf-normalize 0.5.1 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 75218bd6e68ada2e64c27fb691f0ff8b92cfd8b9fe9747f02d862bcbcdd8a5dd
4
- data.tar.gz: fbe6c3579b66435b0b2620178a3ebf46e1bd56f17180a7a64f081aab65a3c447
3
+ metadata.gz: 7b8d7e930eb7f452fef42bd6f66b29dfcd3f526f7c9dafebbfe214ed5dfa4007
4
+ data.tar.gz: c4c60292f9868d39d50545dca77c8d5afd85885b0a1fc690b0ce84d2ee00ddd7
5
5
  SHA512:
6
- metadata.gz: 1363fb834466a6a643245a12776ab6c49cff62436af19ecd5d6dfe58feb83b49971868838dc1d9f687a997ee93af2b12b98a51eedc2f6846e1e0e74810eb4285
7
- data.tar.gz: 62fb2457433083c9b4181353bfa0ec8740a871f3f58b6ac83233a0dd6385b3556a7540c62b1041bab5ba2aee17a55234c70a323fedfc41db09912bf7d3675215
6
+ metadata.gz: 05dd3390670479211a348c4fbd91e0c369355f374d72d2834697641ab64b3cce438b4df8ee63f6db50af1d68e036fa747762474448e6d326d5bfa758ca120b3d
7
+ data.tar.gz: 6189376e59d897e1e6d4b440b6ed75f0e7de970d6cbacb6b21e7e75976e7da0b6e6899085e2972eabf307556d9b17eb5277509fde3096e3ab428a0e84758d9c8
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # RDF::Normalize
2
2
  RDF Graph normalizer for [RDF.rb][RDF.rb].
3
3
 
4
- [![Gem Version](https://badge.fury.io/rb/rdf-normalize.png)](https://badge.fury.io/rb/rdf-normalize)
4
+ [![Gem Version](https://badge.fury.io/rb/rdf-normalize.svg)](https://badge.fury.io/rb/rdf-normalize)
5
5
  [![Build Status](https://github.com/ruby-rdf/rdf-normalize/workflows/CI/badge.svg?branch=develop)](https://github.com/ruby-rdf/rdf-normalize/actions?query=workflow%3ACI)
6
6
  [![Coverage Status](https://coveralls.io/repos/ruby-rdf/rdf-normalize/badge.svg?branch=develop)](https://coveralls.io/github/ruby-rdf/rdf-normalize?branch=develop)
7
7
  [![Gitter chat](https://badges.gitter.im/ruby-rdf/rdf.png)](https://gitter.im/ruby-rdf/rdf)
@@ -17,7 +17,7 @@ to serialize normalized statements.
17
17
  Algorithms implemented:
18
18
 
19
19
  * [URGNA2012](https://www.w3.org/TR/rdf-canon/#dfn-urgna2012)
20
- * [URDNA2015](https://www.w3.org/TR/rdf-canon/#dfn-urdna2015)
20
+ * [RDFC-1.0](https://www.w3.org/TR/rdf-canon/#dfn-rdfc-1-0)
21
21
 
22
22
  Install with `gem install rdf-normalize`
23
23
 
@@ -45,7 +45,7 @@ Full documentation available on [GitHub][Normalize doc]
45
45
  * {RDF::Normalize::Format}
46
46
  * {RDF::Normalize::Writer}
47
47
  * {RDF::Normalize::URGNA2012}
48
- * {RDF::Normalize::URDNA2015}
48
+ * {RDF::Normalize::RDFC10}
49
49
 
50
50
  ## Dependencies
51
51
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.6.1
@@ -1,7 +1,7 @@
1
1
  module RDF::Normalize
2
2
  ##
3
3
  # Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
4
- module Base
4
+ class Base
5
5
  attr_reader :dataset
6
6
 
7
7
  # Enumerates normalized statements
@@ -11,5 +11,12 @@ module RDF::Normalize
11
11
  def each(&block)
12
12
  raise "Not Implemented"
13
13
  end
14
+
15
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
16
+ #
17
+ # @return [Hash{String => String}]
18
+ def to_hash
19
+ raise "Not Implemented"
20
+ end
14
21
  end
15
22
  end
@@ -1,8 +1,6 @@
1
1
  module RDF::Normalize
2
- class Carroll2001
2
+ class Carroll2001 < Base
3
3
  include RDF::Enumerable
4
- include Base
5
- include Utils
6
4
 
7
5
  ##
8
6
  # Create an enumerable with grounded nodes
@@ -0,0 +1,429 @@
1
+ require 'rdf/nquads'
2
+ begin
3
+ require 'json'
4
+ rescue LoadError
5
+ # Used for debug output
6
+ end
7
+
8
+ module RDF::Normalize
9
+ class RDFC10 < Base
10
+ include RDF::Enumerable
11
+ include RDF::Util::Logger
12
+
13
+ ##
14
+ # Create an enumerable with grounded nodes
15
+ #
16
+ # @param [RDF::Enumerable] enumerable
17
+ # @option options [Integer] :max_calls (40)
18
+ # Maximum number of calls allowed for recursive blank node labeling,
19
+ # as a multiple of the total number of blank nodes in the dataset.
20
+ # @return [RDF::Enumerable]
21
+ # @raise [RuntimeError] if the maximum number of calls allowed for recursive blank node labeling is exceeded.
22
+ def initialize(enumerable, **options)
23
+ @dataset, @options = enumerable, options
24
+ end
25
+
26
+ # Yields each normalized statement
27
+ def each(&block)
28
+ ns = NormalizationState.new(**@options)
29
+ log_debug("ca:")
30
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
31
+ log_depth(depth: 2) {normalize_statements(ns, &block)}
32
+ end
33
+
34
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
35
+ #
36
+ # @return [Hash{String => String}]
37
+ def to_hash
38
+ ns = NormalizationState.new(**@options)
39
+ log_debug("ca:")
40
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
41
+ log_depth(depth: 2) {normalize_statements(ns)}
42
+ ns.canonical_issuer.to_hash
43
+ end
44
+
45
+ #
46
+ protected
47
+ def normalize_statements(ns, &block)
48
+ # Step 2: Map BNodes to the statements they are used by
49
+ dataset.each_statement do |statement|
50
+ statement.to_quad.compact.select(&:node?).each do |node|
51
+ ns.add_statement(node, statement)
52
+ end
53
+ end
54
+ log_debug("ca.2:")
55
+ log_debug(" log point", "Extract quads for each bnode (4.5.3 (2)).")
56
+ log_debug(" Bnode to quads:")
57
+ if logger && logger.level == 0
58
+ ns.bnode_to_statements.each do |bn, statements|
59
+ log_debug(" #{bn.id}:")
60
+ statements.each do |s|
61
+ log_debug {" - #{s.to_nquads.strip}"}
62
+ end
63
+ end
64
+ end
65
+
66
+ ns.hash_to_bnodes = {}
67
+
68
+ # Step 3: Calculate hashes for first degree nodes
69
+ log_debug("ca.3:")
70
+ log_debug(" log point", "Calculated first degree hashes (4.5.3 (3)).")
71
+ log_debug(" with:")
72
+ ns.bnode_to_statements.each_key do |node|
73
+ log_debug(" - identifier") {node.id}
74
+ log_debug(" h1dq:")
75
+ hash = log_depth(depth: 8) {ns.hash_first_degree_quads(node)}
76
+ ns.add_bnode_hash(node, hash)
77
+ end
78
+
79
+ # Step 4: Create canonical replacements for hashes mapping to a single node
80
+ log_debug("ca.4:")
81
+ log_debug(" log point", "Create canonical replacements for hashes mapping to a single node (4.5.3 (4)).")
82
+ log_debug(" with:") unless ns.hash_to_bnodes.empty?
83
+ ns.hash_to_bnodes.keys.sort.each do |hash|
84
+ identifier_list = ns.hash_to_bnodes[hash]
85
+ next if identifier_list.length > 1
86
+ node = identifier_list.first
87
+ id = ns.canonical_issuer.issue_identifier(node)
88
+ log_debug(" - identifier") {node.id}
89
+ log_debug(" hash", hash)
90
+ log_debug(" canonical label", id)
91
+ ns.hash_to_bnodes.delete(hash)
92
+ end
93
+
94
+ # Step 5: Iterate over hashes having more than one node
95
+ log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
96
+ log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
97
+ log_debug(" with:") unless ns.hash_to_bnodes.empty?
98
+
99
+ # Initialize the number of calls allowed to hash_n_degree_quads
100
+ # as a multiple of the total number of blank nodes in the dataset.
101
+ ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
102
+
103
+ ns.hash_to_bnodes.keys.sort.each do |hash|
104
+ identifier_list = ns.hash_to_bnodes[hash]
105
+
106
+ log_debug(" - hash", hash)
107
+ log_debug(" identifier list") {identifier_list.map(&:id).to_json(indent: ' ')}
108
+ hash_path_list = []
109
+
110
+ # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
111
+ log_debug(" ca.5.2:")
112
+ log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5.2)).")
113
+ log_debug(" with:") unless identifier_list.empty?
114
+ identifier_list.each do |identifier|
115
+ next if ns.canonical_issuer.issued.include?(identifier)
116
+ temporary_issuer = IdentifierIssuer.new("b")
117
+ temporary_issuer.issue_identifier(identifier)
118
+ log_debug(" - identifier") {identifier.id}
119
+ hash_path_list << log_depth(depth: 12) {ns.hash_n_degree_quads(identifier, temporary_issuer)}
120
+ end
121
+
122
+ # Create canonical replacements for nodes
123
+ log_debug(" ca.5.3:") unless hash_path_list.empty?
124
+ log_debug(" log point", "Canonical identifiers for temporary identifiers (4.5.3 (5.3)).")
125
+ log_debug(" issuer:") unless hash_path_list.empty?
126
+ hash_path_list.sort_by(&:first).each do |result, issuer|
127
+ issuer.issued.each do |node|
128
+ id = ns.canonical_issuer.issue_identifier(node)
129
+ log_debug(" - blank node") {node.id}
130
+ log_debug(" canonical identifier", id)
131
+ end
132
+ end
133
+ end
134
+
135
+ # Step 6: Yield statements using BNodes from canonical replacements
136
+ if block_given?
137
+ dataset.each_statement do |statement|
138
+ if statement.has_blank_nodes?
139
+ quad = statement.to_quad.compact.map do |term|
140
+ term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
141
+ end
142
+ block.call RDF::Statement.from(quad)
143
+ else
144
+ block.call statement
145
+ end
146
+ end
147
+ end
148
+
149
+ log_debug("ca.6:")
150
+ log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
151
+ log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
152
+ dataset
153
+ end
154
+
155
+ private
156
+
157
+ class NormalizationState
158
+ include RDF::Util::Logger
159
+
160
+ attr_accessor :bnode_to_statements
161
+ attr_accessor :hash_to_bnodes
162
+ attr_accessor :canonical_issuer
163
+ attr_accessor :max_calls
164
+ attr_accessor :total_calls
165
+
166
+ def initialize(**options)
167
+ @options = options
168
+ @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
169
+ @max_calls, @total_calls = nil, 0
170
+ end
171
+
172
+ def add_statement(node, statement)
173
+ bnode_to_statements[node] ||= []
174
+ bnode_to_statements[node] << statement unless bnode_to_statements[node].any? {|st| st.eql?(statement)}
175
+ end
176
+
177
+ def add_bnode_hash(node, hash)
178
+ hash_to_bnodes[hash] ||= []
179
+ # Match on object IDs of nodes, rather than simple node equality
180
+ hash_to_bnodes[hash] << node unless hash_to_bnodes[hash].any? {|n| n.eql?(node)}
181
+ end
182
+
183
+ # This algorithm calculates a hash for a given blank node across the quads in a dataset in which that blank node is a component. If the hash uniquely identifies that blank node, no further examination is necessary. Otherwise, a hash will be created for the blank node using the algorithm in [4.9 Hash N-Degree Quads](https://w3c.github.io/rdf-canon/spec/#hash-nd-quads) invoked via [4.5 Canonicalization Algorithm](https://w3c.github.io/rdf-canon/spec/#canon-algorithm).
184
+ #
185
+ # @param [RDF::Node] node The reference blank node identifier
186
+ # @return [String] the SHA256 hexdigest hash of statements using this node, with replacements
187
+ def hash_first_degree_quads(node)
188
+ nquads = bnode_to_statements[node].
189
+ map do |statement|
190
+ quad = statement.to_quad.map do |t|
191
+ case t
192
+ when node then RDF::Node("a")
193
+ when RDF::Node then RDF::Node("z")
194
+ else t
195
+ end
196
+ end
197
+ RDF::Statement.from(quad).to_nquads
198
+ end
199
+ log_debug("log point", "Hash First Degree Quads function (4.7.3).")
200
+ log_debug("nquads:")
201
+ nquads.each do |q|
202
+ log_debug {" - #{q.strip}"}
203
+ end
204
+
205
+ result = hexdigest(nquads.sort.join)
206
+ log_debug("hash") {result}
207
+ result
208
+ end
209
+
210
+ # @param [RDF::Node] related
211
+ # @param [RDF::Statement] statement
212
+ # @param [IdentifierIssuer] issuer
213
+ # @param [Symbol] position one of :s, :o, or :g
214
+ # @return [String] the SHA256 hexdigest hash
215
+ def hash_related_node(related, statement, issuer, position)
216
+ log_debug("related") {related.id}
217
+ input = "#{position}"
218
+ input << statement.predicate.to_ntriples unless position == :g
219
+ if identifier = (canonical_issuer.identifier(related) ||
220
+ issuer.identifier(related))
221
+ input << "_:#{identifier}"
222
+ else
223
+ log_debug("h1dq:")
224
+ input << log_depth(depth: 2) do
225
+ hash_first_degree_quads(related)
226
+ end
227
+ end
228
+ log_debug("input") {input.inspect}
229
+ log_debug("hash") {hexdigest(input)}
230
+ hexdigest(input)
231
+ end
232
+
233
+ # @param [RDF::Node] node
234
+ # @param [IdentifierIssuer] issuer
235
+ # @return [Array<String,IdentifierIssuer>] the Hash and issuer
236
+ # @raise [RuntimeError] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
237
+ def hash_n_degree_quads(node, issuer)
238
+ log_debug("hndq:")
239
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
240
+ log_debug(" identifier") {node.id}
241
+ log_debug(" issuer") {issuer.inspect}
242
+
243
+ if max_calls && total_calls >= max_calls
244
+ raise "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
245
+ end
246
+ @total_calls += 1
247
+
248
+ # hash to related blank nodes map
249
+ hn = {}
250
+
251
+ log_debug(" hndq.2:")
252
+ log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
253
+ log_debug(" quads:")
254
+ bnode_to_statements[node].each do |s|
255
+ log_debug {" - #{s.to_nquads.strip}"}
256
+ end
257
+
258
+ # Step 3
259
+ log_debug(" hndq.3:")
260
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
261
+ log_debug(" with:") unless bnode_to_statements[node].empty?
262
+ bnode_to_statements[node].each do |statement|
263
+ log_debug {" - quad: #{statement.to_nquads.strip}"}
264
+ log_debug(" hndq.3.1:")
265
+ log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
266
+ log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
267
+ end
268
+ log_debug(" Hash to bnodes:")
269
+ hn.each do |k,v|
270
+ log_debug(" #{k}:")
271
+ v.each do |vv|
272
+ log_debug(" - #{vv.id}")
273
+ end
274
+ end
275
+
276
+ data_to_hash = ""
277
+
278
+ # Step 5
279
+ log_debug(" hndq.5:")
280
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5)), entering loop.")
281
+ log_debug(" with:")
282
+ hn.keys.sort.each do |hash|
283
+ log_debug(" - related hash", hash)
284
+ log_debug(" data to hash") {data_to_hash.to_json}
285
+ list = hn[hash]
286
+ # Iterate over related nodes
287
+ chosen_path, chosen_issuer = "", nil
288
+ data_to_hash += hash
289
+
290
+ log_debug(" hndq.5.4:")
291
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4)), entering loop.")
292
+ log_debug(" with:") unless list.empty?
293
+ list.permutation do |permutation|
294
+ log_debug(" - perm") {permutation.map(&:id).to_json(indent: ' ', space: ' ')}
295
+ issuer_copy, path, recursion_list = issuer.dup, "", []
296
+
297
+ log_debug(" hndq.5.4.4:")
298
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4.4)), entering loop.")
299
+ log_debug(" with:")
300
+ permutation.each do |related|
301
+ log_debug(" - related") {related.id}
302
+ log_debug(" path") {path.to_json}
303
+ if canonical_issuer.identifier(related)
304
+ path << '_:' + canonical_issuer.issue_identifier(related)
305
+ else
306
+ recursion_list << related if !issuer_copy.identifier(related)
307
+ path << '_:' + issuer_copy.issue_identifier(related)
308
+ end
309
+
310
+ # Skip to the next permutation if chosen path isn't empty and the path is greater than the chosen path
311
+ break if !chosen_path.empty? && path.length >= chosen_path.length
312
+ end
313
+
314
+ log_debug(" hndq.5.4.5:")
315
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4.5)), before possible recursion.")
316
+ log_debug(" recursion list") {recursion_list.map(&:id).to_json(indent: ' ')}
317
+ log_debug(" path") {path.to_json}
318
+ log_debug(" with:") unless recursion_list.empty?
319
+ recursion_list.each do |related|
320
+ log_debug(" - related") {related.id}
321
+ result = log_depth(depth: 18) do
322
+ hash_n_degree_quads(related, issuer_copy)
323
+ end
324
+ path << '_:' + issuer_copy.issue_identifier(related)
325
+ path << "<#{result.first}>"
326
+ issuer_copy = result.last
327
+ log_debug(" hndq.5.4.5.4:")
328
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4.5.4)), combine result of recursion.")
329
+ log_debug(" path") {path.to_json}
330
+ log_debug(" issuer copy") {issuer_copy.inspect}
331
+ break if !chosen_path.empty? && path.length >= chosen_path.length && path > chosen_path
332
+ end
333
+
334
+ if chosen_path.empty? || path < chosen_path
335
+ chosen_path, chosen_issuer = path, issuer_copy
336
+ end
337
+ end
338
+
339
+ data_to_hash += chosen_path
340
+ log_debug(" hndq.5.5:")
341
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.5). End of current loop with Hn hashes.")
342
+ log_debug(" chosen path") {chosen_path.to_json}
343
+ log_debug(" data to hash") {data_to_hash.to_json}
344
+ issuer = chosen_issuer
345
+ end
346
+
347
+ log_debug(" hndq.6:")
348
+ log_debug(" log point", "Leaving Hash N-Degree Quads function (4.9.3).")
349
+ log_debug(" hash") {hexdigest(data_to_hash)}
350
+ log_depth(depth: 4) {log_debug("issuer") {issuer.inspect}}
351
+ return [hexdigest(data_to_hash), issuer]
352
+ end
353
+
354
+ def inspect
355
+ "NormalizationState:\nbnode_to_statements: #{inspect_bnode_to_statements}\nhash_to_bnodes: #{inspect_hash_to_bnodes}\ncanonical_issuer: #{canonical_issuer.inspect}"
356
+ end
357
+
358
+ def inspect_bnode_to_statements
359
+ bnode_to_statements.map do |n, statements|
360
+ "#{n.id}: #{statements.map {|s| s.to_nquads.strip}}"
361
+ end.join(", ")
362
+ end
363
+
364
+ def inspect_hash_to_bnodes
365
+ end
366
+
367
+ protected
368
+
369
+ def hexdigest(val)
370
+ Digest::SHA256.hexdigest(val)
371
+ end
372
+
373
+ # Group adjacent bnodes by hash
374
+ def hash_related_statement(node, statement, issuer, map)
375
+ log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
376
+ statement.to_h(:s, :p, :o, :g).each do |pos, term|
377
+ next if !term.is_a?(RDF::Node) || term == node
378
+
379
+ log_debug(" - position", pos)
380
+ hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
381
+ map[hash] ||= []
382
+ map[hash] << term unless map[hash].any? {|n| n.eql?(term)}
383
+ end
384
+ end
385
+ end
386
+
387
+ class IdentifierIssuer
388
+ def initialize(prefix = "c14n")
389
+ @prefix, @counter, @issued = prefix, 0, {}
390
+ end
391
+
392
+ # Return an identifier for this BNode
393
+ # @param [RDF::Node] node
394
+ # @return [String] Canonical identifier for node
395
+ def issue_identifier(node)
396
+ @issued[node] ||= begin
397
+ res, @counter = @prefix + @counter.to_s, @counter + 1
398
+ res
399
+ end
400
+ end
401
+
402
+ def issued
403
+ @issued.keys
404
+ end
405
+
406
+ # @return [String, nil] Canonical identifier assigned to node, or nil if none has been issued
407
+ def identifier(node)
408
+ @issued[node]
409
+ end
410
+
411
+ # @return [Hash{String => String}] the issued identifiers map
412
+ def to_hash
413
+ @issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
414
+ end
415
+
416
+ # Duplicate this issuer, ensuring that the issued identifiers remain distinct
417
+ # @return [IdentifierIssuer]
418
+ def dup
419
+ other = super
420
+ other.instance_variable_set(:@issued, @issued.dup)
421
+ other
422
+ end
423
+
424
+ def inspect
425
+ "{#{@issued.map {|k,v| "#{k.id}: #{v}"}.join(', ')}}"
426
+ end
427
+ end
428
+ end
429
+ end
@@ -1,12 +1,18 @@
1
1
  module RDF::Normalize
2
- class URGNA2012 < URDNA2015
2
+ class URGNA2012 < RDFC10
3
3
 
4
4
  def each(&block)
5
- ns = NormalizationState.new(@options)
5
+ ns = NormalizationState.new(**@options)
6
6
  normalize_statements(ns, &block)
7
7
  end
8
8
 
9
- class NormalizationState < URDNA2015::NormalizationState
9
+ def to_hash
10
+ ns = NormalizationState.new(**@options)
11
+ normalize_statements(ns)
12
+ ns.canonical_issuer.to_h
13
+ end
14
+
15
+ class NormalizationState < RDFC10::NormalizationState
10
16
  protected
11
17
 
12
18
  # 2012 version uses SHA-1
@@ -53,7 +53,7 @@ module RDF::Normalize
53
53
  #
54
54
  # @return [void]
55
55
  def write_epilogue
56
- statements = RDF::Normalize.new(@repo, **@options).
56
+ RDF::Normalize.new(@repo, **@options).
57
57
  statements.
58
58
  reject(&:variable?).
59
59
  map {|s| format_statement(s)}.
data/lib/rdf/normalize.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'rdf'
2
+ require 'digest'
2
3
 
3
4
  module RDF
4
5
  ##
@@ -31,7 +32,7 @@ module RDF
31
32
  autoload :Base, 'rdf/normalize/base'
32
33
  autoload :Carroll2001,'rdf/normalize/carroll2001'
33
34
  autoload :URGNA2012, 'rdf/normalize/urgna2012'
34
- autoload :URDNA2015, 'rdf/normalize/urdna2015'
35
+ autoload :RDFC10, 'rdf/normalize/rdfc10'
35
36
  autoload :VERSION, 'rdf/normalize/version'
36
37
  autoload :Writer, 'rdf/normalize/writer'
37
38
 
@@ -42,19 +43,23 @@ module RDF
42
43
  ALGORITHMS = {
43
44
  carroll2001: :Carroll2001,
44
45
  urgna2012: :URGNA2012,
45
- urdna2015: :URDNA2015
46
+ rdfc10: :RDFC10
46
47
  }.freeze
47
48
 
48
49
  ##
49
50
  # Creates a new normalizer instance using either the specified or default normalizer algorithm
50
51
  # @param [RDF::Enumerable] enumerable
51
52
  # @param [Hash{Symbol => Object}] options
52
- # @option options [Base] :algorithm (:urdna2015)
53
- # One of `:carroll2001`, `:urgna2012`, or `:urdna2015`
53
+ # @option options [Base] :algorithm (:rdfc10)
54
+ # One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
55
+ # @option options [Integer] :max_calls
56
+ # Maximum number of calls allowed for recursive blank node labeling,
57
+ # as a multiple of the total number of blank nodes in the dataset.
58
+ # @option options [Boolean] :identifier_map
54
59
  # @return [RDF::Normalize::Base]
55
60
  # @raise [ArgumentError] selected algorithm not defined
56
61
  def new(enumerable, **options)
57
- algorithm = options.fetch(:algorithm, :urdna2015)
62
+ algorithm = options.fetch(:algorithm, :rdfc10)
58
63
  raise ArgumentError, "No algoritm defined for #{algorithm.to_sym}" unless ALGORITHMS.has_key?(algorithm)
59
64
  algorithm_class = const_get(ALGORITHMS[algorithm])
60
65
  algorithm_class.new(enumerable, **options)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdf-normalize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-27 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdf
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '3.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rdf-trig
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.2'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.2'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: yard
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -94,7 +108,7 @@ files:
94
108
  - lib/rdf/normalize/base.rb
95
109
  - lib/rdf/normalize/carroll2001.rb
96
110
  - lib/rdf/normalize/format.rb
97
- - lib/rdf/normalize/urdna2015.rb
111
+ - lib/rdf/normalize/rdfc10.rb
98
112
  - lib/rdf/normalize/urgna2012.rb
99
113
  - lib/rdf/normalize/version.rb
100
114
  - lib/rdf/normalize/writer.rb
@@ -122,7 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
136
  - !ruby/object:Gem::Version
123
137
  version: '0'
124
138
  requirements: []
125
- rubygems_version: 3.3.7
139
+ rubygems_version: 3.4.13
126
140
  signing_key:
127
141
  specification_version: 4
128
142
  summary: RDF Graph normalizer for Ruby.
@@ -1,257 +0,0 @@
1
- module RDF::Normalize
2
- class URDNA2015
3
- include RDF::Enumerable
4
- include RDF::Util::Logger
5
- include Base
6
-
7
- ##
8
- # Create an enumerable with grounded nodes
9
- #
10
- # @param [RDF::Enumerable] enumerable
11
- # @return [RDF::Enumerable]
12
- def initialize(enumerable, **options)
13
- @dataset, @options = enumerable, options
14
- end
15
-
16
- def each(&block)
17
- ns = NormalizationState.new(@options)
18
- normalize_statements(ns, &block)
19
- end
20
-
21
- protected
22
- def normalize_statements(ns, &block)
23
- # Map BNodes to the statements they are used by
24
- dataset.each_statement do |statement|
25
- statement.to_quad.compact.select(&:node?).each do |node|
26
- ns.add_statement(node, statement)
27
- end
28
- end
29
-
30
- ns.hash_to_bnodes = {}
31
-
32
- # Calculate hashes for first degree nodes
33
- ns.bnode_to_statements.each_key do |node|
34
- hash = log_depth {ns.hash_first_degree_quads(node)}
35
- log_debug("1deg") {"hash: #{hash}"}
36
- ns.add_bnode_hash(node, hash)
37
- end
38
-
39
- # Create canonical replacements for hashes mapping to a single node
40
- ns.hash_to_bnodes.keys.sort.each do |hash|
41
- identifier_list = ns.hash_to_bnodes[hash]
42
- next if identifier_list.length > 1
43
- node = identifier_list.first
44
- id = ns.canonical_issuer.issue_identifier(node)
45
- log_debug("single node") {"node: #{node.to_ntriples}, hash: #{hash}, id: #{id}"}
46
- ns.hash_to_bnodes.delete(hash)
47
- end
48
-
49
- # Iterate over hashs having more than one node
50
- ns.hash_to_bnodes.keys.sort.each do |hash|
51
- identifier_list = ns.hash_to_bnodes[hash]
52
-
53
- log_debug("multiple nodes") {"node: #{identifier_list.map(&:to_ntriples).join(",")}, hash: #{hash}"}
54
- hash_path_list = []
55
-
56
- # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
57
- identifier_list.each do |identifier|
58
- next if ns.canonical_issuer.issued.include?(identifier)
59
- temporary_issuer = IdentifierIssuer.new("_:b")
60
- temporary_issuer.issue_identifier(identifier)
61
- hash_path_list << log_depth {ns.hash_n_degree_quads(identifier, temporary_issuer)}
62
- end
63
- log_debug("->") {"hash_path_list: #{hash_path_list.map(&:first).inspect}"}
64
-
65
- # Create canonical replacements for nodes
66
- hash_path_list.sort_by(&:first).map(&:last).each do |issuer|
67
- issuer.issued.each do |node|
68
- id = ns.canonical_issuer.issue_identifier(node)
69
- log_debug("-->") {"node: #{node.to_ntriples}, id: #{id}"}
70
- end
71
- end
72
- end
73
-
74
- # Yield statements using BNodes from canonical replacements
75
- dataset.each_statement do |statement|
76
- if statement.has_blank_nodes?
77
- quad = statement.to_quad.compact.map do |term|
78
- term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)[2..-1]) : term
79
- end
80
- block.call RDF::Statement.from(quad)
81
- else
82
- block.call statement
83
- end
84
- end
85
- end
86
-
87
- private
88
-
89
- class NormalizationState
90
- include RDF::Util::Logger
91
-
92
- attr_accessor :bnode_to_statements
93
- attr_accessor :hash_to_bnodes
94
- attr_accessor :canonical_issuer
95
-
96
- def initialize(options)
97
- @options = options
98
- @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("_:c14n")
99
- end
100
-
101
- def add_statement(node, statement)
102
- bnode_to_statements[node] ||= []
103
- bnode_to_statements[node] << statement unless bnode_to_statements[node].any? {|st| st.eql?(statement)}
104
- end
105
-
106
- def add_bnode_hash(node, hash)
107
- hash_to_bnodes[hash] ||= []
108
- # Match on object IDs of nodes, rather than simple node equality
109
- hash_to_bnodes[hash] << node unless hash_to_bnodes[hash].any? {|n| n.eql?(node)}
110
- end
111
-
112
- # @param [RDF::Node] node
113
- # @return [String] the SHA256 hexdigest hash of statements using this node, with replacements
114
- def hash_first_degree_quads(node)
115
- quads = bnode_to_statements[node].
116
- map do |statement|
117
- quad = statement.to_quad.map do |t|
118
- case t
119
- when node then RDF::Node("a")
120
- when RDF::Node then RDF::Node("z")
121
- else t
122
- end
123
- end
124
- RDF::NQuads::Writer.serialize(RDF::Statement.from(quad))
125
- end
126
-
127
- log_debug("1deg") {"node: #{node}, quads: #{quads}"}
128
- hexdigest(quads.sort.join)
129
- end
130
-
131
- # @param [RDF::Node] related
132
- # @param [RDF::Statement] statement
133
- # @param [IdentifierIssuer] issuer
134
- # @param [String] position one of :s, :o, or :g
135
- # @return [String] the SHA256 hexdigest hash
136
- def hash_related_node(related, statement, issuer, position)
137
- identifier = canonical_issuer.identifier(related) ||
138
- issuer.identifier(related) ||
139
- hash_first_degree_quads(related)
140
- input = "#{position}"
141
- input << statement.predicate.to_ntriples unless position == :g
142
- input << identifier
143
- log_debug("hrel") {"input: #{input.inspect}, hash: #{hexdigest(input)}"}
144
- hexdigest(input)
145
- end
146
-
147
- # @param [RDF::Node] identifier
148
- # @param [IdentifierIssuer] issuer
149
- # @return [Array<String,IdentifierIssuer>] the Hash and issuer
150
- def hash_n_degree_quads(identifier, issuer)
151
- log_debug("ndeg") {"identifier: #{identifier.to_ntriples}"}
152
-
153
- # hash to related blank nodes map
154
- map = {}
155
-
156
- bnode_to_statements[identifier].each do |statement|
157
- hash_related_statement(identifier, statement, issuer, map)
158
- end
159
-
160
- data_to_hash = ""
161
-
162
- log_debug("ndeg") {"map: #{map.map {|h,l| "#{h}: #{l.map(&:to_ntriples)}"}.join('; ')}"}
163
- log_depth do
164
- map.keys.sort.each do |hash|
165
- list = map[hash]
166
- # Iterate over related nodes
167
- chosen_path, chosen_issuer = "", nil
168
- data_to_hash += hash
169
-
170
- list.permutation do |permutation|
171
- log_debug("ndeg") {"perm: #{permutation.map(&:to_ntriples).join(",")}"}
172
- issuer_copy, path, recursion_list = issuer.dup, "", []
173
-
174
- permutation.each do |related|
175
- if canonical_issuer.identifier(related)
176
- path << canonical_issuer.issue_identifier(related)
177
- else
178
- recursion_list << related if !issuer_copy.identifier(related)
179
- path << issuer_copy.issue_identifier(related)
180
- end
181
-
182
- # Skip to the next permutation if chosen path isn't empty and the path is greater than the chosen path
183
- break if !chosen_path.empty? && path.length >= chosen_path.length
184
- end
185
- log_debug("ndeg") {"hash: #{hash}, path: #{path}, recursion: #{recursion_list.map(&:to_ntriples)}"}
186
-
187
- recursion_list.each do |related|
188
- result = log_depth {hash_n_degree_quads(related, issuer_copy)}
189
- path << issuer_copy.issue_identifier(related)
190
- path << "<#{result.first}>"
191
- issuer_copy = result.last
192
- break if !chosen_path.empty? && path.length >= chosen_path.length && path > chosen_path
193
- end
194
-
195
- if chosen_path.empty? || path < chosen_path
196
- chosen_path, chosen_issuer = path, issuer_copy
197
- end
198
- end
199
-
200
- data_to_hash += chosen_path
201
- issuer = chosen_issuer
202
- end
203
- end
204
-
205
- log_debug("ndeg") {"datatohash: #{data_to_hash.inspect}, hash: #{hexdigest(data_to_hash)}"}
206
- return [hexdigest(data_to_hash), issuer]
207
- end
208
-
209
- protected
210
-
211
- def hexdigest(val)
212
- Digest::SHA256.hexdigest(val)
213
- end
214
-
215
- # Group adjacent bnodes by hash
216
- def hash_related_statement(identifier, statement, issuer, map)
217
- statement.to_h(:s, :p, :o, :g).each do |pos, term|
218
- next if !term.is_a?(RDF::Node) || term == identifier
219
-
220
- hash = log_depth {hash_related_node(term, statement, issuer, pos)}
221
- map[hash] ||= []
222
- map[hash] << term unless map[hash].any? {|n| n.eql?(term)}
223
- end
224
- end
225
- end
226
-
227
- class IdentifierIssuer
228
- def initialize(prefix = "_:c14n")
229
- @prefix, @counter, @issued = prefix, 0, {}
230
- end
231
-
232
- # Return an identifier for this BNode
233
- def issue_identifier(node)
234
- @issued[node] ||= begin
235
- res, @counter = @prefix + @counter.to_s, @counter + 1
236
- res
237
- end
238
- end
239
-
240
- def issued
241
- @issued.keys
242
- end
243
-
244
- def identifier(node)
245
- @issued[node]
246
- end
247
-
248
- # Duplicate this issuer, ensuring that the issued identifiers remain distinct
249
- # @return [IdentifierIssuer]
250
- def dup
251
- other = super
252
- other.instance_variable_set(:@issued, @issued.dup)
253
- other
254
- end
255
- end
256
- end
257
- end