rdf-normalize 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 75218bd6e68ada2e64c27fb691f0ff8b92cfd8b9fe9747f02d862bcbcdd8a5dd
4
- data.tar.gz: fbe6c3579b66435b0b2620178a3ebf46e1bd56f17180a7a64f081aab65a3c447
3
+ metadata.gz: 15478756de443574bde6120436faf09bec1f7e40dcfc60f39fc97af92e686738
4
+ data.tar.gz: d5617da52a4d7e3429452f691e4a9ccb7f6ac8bedcef6dd66583b1322e0b57f0
5
5
  SHA512:
6
- metadata.gz: 1363fb834466a6a643245a12776ab6c49cff62436af19ecd5d6dfe58feb83b49971868838dc1d9f687a997ee93af2b12b98a51eedc2f6846e1e0e74810eb4285
7
- data.tar.gz: 62fb2457433083c9b4181353bfa0ec8740a871f3f58b6ac83233a0dd6385b3556a7540c62b1041bab5ba2aee17a55234c70a323fedfc41db09912bf7d3675215
6
+ metadata.gz: 7c2ccd4449f12d5095702d19a8c1d27539aa5afa23c8b96ffcf6f43ee0d6d10fd763e2dbc98f2ef008ede3edc3fda1801eb6a1cd3ad0e80e3b82995017ae93e4
7
+ data.tar.gz: f760c7336703292679c82b6abbea86ffe7b8ac1b803508c187d8aee7bcd8cd635d0b039d928b7d145198f7df884027aeb911fa2e97e0e9d171cae92e4d26ed0b
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # RDF::Normalize
2
2
  RDF Graph normalizer for [RDF.rb][RDF.rb].
3
3
 
4
- [![Gem Version](https://badge.fury.io/rb/rdf-normalize.png)](https://badge.fury.io/rb/rdf-normalize)
4
+ [![Gem Version](https://badge.fury.io/rb/rdf-normalize.svg)](https://badge.fury.io/rb/rdf-normalize)
5
5
  [![Build Status](https://github.com/ruby-rdf/rdf-normalize/workflows/CI/badge.svg?branch=develop)](https://github.com/ruby-rdf/rdf-normalize/actions?query=workflow%3ACI)
6
6
  [![Coverage Status](https://coveralls.io/repos/ruby-rdf/rdf-normalize/badge.svg?branch=develop)](https://coveralls.io/github/ruby-rdf/rdf-normalize?branch=develop)
7
7
  [![Gitter chat](https://badges.gitter.im/ruby-rdf/rdf.png)](https://gitter.im/ruby-rdf/rdf)
@@ -17,7 +17,7 @@ to serialize normalized statements.
17
17
  Algorithms implemented:
18
18
 
19
19
  * [URGNA2012](https://www.w3.org/TR/rdf-canon/#dfn-urgna2012)
20
- * [URDNA2015](https://www.w3.org/TR/rdf-canon/#dfn-urdna2015)
20
+ * [RDFC-1.0](https://www.w3.org/TR/rdf-canon/#dfn-rdfc-1-0)
21
21
 
22
22
  Install with `gem install rdf-normalize`
23
23
 
@@ -45,7 +45,7 @@ Full documentation available on [GitHub][Normalize doc]
45
45
  * {RDF::Normalize::Format}
46
46
  * {RDF::Normalize::Writer}
47
47
  * {RDF::Normalize::URGNA2012}
48
- * {RDF::Normalize::URDNA2015}
48
+ * {RDF::Normalize::RDFC10}
49
49
 
50
50
  ## Dependencies
51
51
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.6.0
@@ -0,0 +1,390 @@
1
+ require 'rdf/nquads'
2
+ begin
3
+ require 'json'
4
+ rescue LoadError
5
+ # Used for debug output
6
+ end
7
+
8
+ module RDF::Normalize
9
+ class RDFC10
10
+ include RDF::Enumerable
11
+ include RDF::Util::Logger
12
+ include Base
13
+
14
+ ##
15
+ # Create an enumerable with grounded nodes
16
+ #
17
+ # @param [RDF::Enumerable] enumerable
18
+ # @return [RDF::Enumerable]
19
+ def initialize(enumerable, **options)
20
+ @dataset, @options = enumerable, options
21
+ end
22
+
23
+ def each(&block)
24
+ ns = NormalizationState.new(@options)
25
+ log_debug("ca:")
26
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
27
+ log_depth(depth: 2) {normalize_statements(ns, &block)}
28
+ end
29
+
30
+ protected
31
+ def normalize_statements(ns, &block)
32
+ # Step 2: Map BNodes to the statements they are used by
33
+ dataset.each_statement do |statement|
34
+ statement.to_quad.compact.select(&:node?).each do |node|
35
+ ns.add_statement(node, statement)
36
+ end
37
+ end
38
+ log_debug("ca.2:")
39
+ log_debug(" log point", "Extract quads for each bnode (4.5.3 (2)).")
40
+ log_debug(" Bnode to quads:")
41
+ if logger && logger.level == 0
42
+ ns.bnode_to_statements.each do |bn, statements|
43
+ log_debug(" #{bn.id}:")
44
+ statements.each do |s|
45
+ log_debug {" - #{s.to_nquads.strip}"}
46
+ end
47
+ end
48
+ end
49
+
50
+ ns.hash_to_bnodes = {}
51
+
52
+ # Step 3: Calculate hashes for first degree nodes
53
+ log_debug("ca.3:")
54
+ log_debug(" log point", "Calculated first degree hashes (4.5.3 (3)).")
55
+ log_debug(" with:")
56
+ ns.bnode_to_statements.each_key do |node|
57
+ log_debug(" - identifier") {node.id}
58
+ log_debug(" h1dq:")
59
+ hash = log_depth(depth: 8) {ns.hash_first_degree_quads(node)}
60
+ ns.add_bnode_hash(node, hash)
61
+ end
62
+
63
+ # Step 4: Create canonical replacements for hashes mapping to a single node
64
+ log_debug("ca.4:")
65
+ log_debug(" log point", "Create canonical replacements for hashes mapping to a single node (4.5.3 (4)).")
66
+ log_debug(" with:") unless ns.hash_to_bnodes.empty?
67
+ ns.hash_to_bnodes.keys.sort.each do |hash|
68
+ identifier_list = ns.hash_to_bnodes[hash]
69
+ next if identifier_list.length > 1
70
+ node = identifier_list.first
71
+ id = ns.canonical_issuer.issue_identifier(node)
72
+ log_debug(" - identifier") {node.id}
73
+ log_debug(" hash", hash)
74
+ log_debug(" canonical label", id)
75
+ ns.hash_to_bnodes.delete(hash)
76
+ end
77
+
78
+ # Step 5: Iterate over hashes having more than one node
79
+ log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
80
+ log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
81
+ log_debug(" with:") unless ns.hash_to_bnodes.empty?
82
+ ns.hash_to_bnodes.keys.sort.each do |hash|
83
+ identifier_list = ns.hash_to_bnodes[hash]
84
+
85
+ log_debug(" - hash", hash)
86
+ log_debug(" identifier list") {identifier_list.map(&:id).to_json(indent: ' ')}
87
+ hash_path_list = []
88
+
89
+ # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
90
+ log_debug(" ca.5.2:")
91
+ log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5.2)).")
92
+ log_debug(" with:") unless identifier_list.empty?
93
+ identifier_list.each do |identifier|
94
+ next if ns.canonical_issuer.issued.include?(identifier)
95
+ temporary_issuer = IdentifierIssuer.new("b")
96
+ temporary_issuer.issue_identifier(identifier)
97
+ log_debug(" - identifier") {identifier.id}
98
+ hash_path_list << log_depth(depth: 12) {ns.hash_n_degree_quads(identifier, temporary_issuer)}
99
+ end
100
+
101
+ # Create canonical replacements for nodes
102
+ log_debug(" ca.5.3:") unless hash_path_list.empty?
103
+ log_debug(" log point", "Canonical identifiers for temporary identifiers (4.5.3 (5.3)).")
104
+ log_debug(" issuer:") unless hash_path_list.empty?
105
+ hash_path_list.sort_by(&:first).each do |result, issuer|
106
+ issuer.issued.each do |node|
107
+ id = ns.canonical_issuer.issue_identifier(node)
108
+ log_debug(" - blank node") {node.id}
109
+ log_debug(" canonical identifier", id)
110
+ end
111
+ end
112
+ end
113
+
114
+ # Step 6: Yield statements using BNodes from canonical replacements
115
+ dataset.each_statement do |statement|
116
+ if statement.has_blank_nodes?
117
+ quad = statement.to_quad.compact.map do |term|
118
+ term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
119
+ end
120
+ block.call RDF::Statement.from(quad)
121
+ else
122
+ block.call statement
123
+ end
124
+ end
125
+
126
+ log_debug("ca.6:")
127
+ log_debug(" log point", "Replace original with canonical labels (4.5.3 (6)).")
128
+ log_debug(" canonical issuer: #{ns.canonical_issuer.inspect}")
129
+ dataset
130
+ end
131
+
132
+ private
133
+
134
+ class NormalizationState
135
+ include RDF::Util::Logger
136
+
137
+ attr_accessor :bnode_to_statements
138
+ attr_accessor :hash_to_bnodes
139
+ attr_accessor :canonical_issuer
140
+
141
+ def initialize(options)
142
+ @options = options
143
+ @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
144
+ end
145
+
146
+ def add_statement(node, statement)
147
+ bnode_to_statements[node] ||= []
148
+ bnode_to_statements[node] << statement unless bnode_to_statements[node].any? {|st| st.eql?(statement)}
149
+ end
150
+
151
+ def add_bnode_hash(node, hash)
152
+ hash_to_bnodes[hash] ||= []
153
+ # Match on object IDs of nodes, rather than simple node equality
154
+ hash_to_bnodes[hash] << node unless hash_to_bnodes[hash].any? {|n| n.eql?(node)}
155
+ end
156
+
157
+ # This algorithm calculates a hash for a given blank node across the quads in a dataset in which that blank node is a component. If the hash uniquely identifies that blank node, no further examination is necessary. Otherwise, a hash will be created for the blank node using the algorithm in [4.9 Hash N-Degree Quads](https://w3c.github.io/rdf-canon/spec/#hash-nd-quads) invoked via [4.5 Canonicalization Algorithm](https://w3c.github.io/rdf-canon/spec/#canon-algorithm).
158
+ #
159
+ # @param [RDF::Node] node The reference blank node identifier
160
+ # @return [String] the SHA256 hexdigest hash of statements using this node, with replacements
161
+ def hash_first_degree_quads(node)
162
+ nquads = bnode_to_statements[node].
163
+ map do |statement|
164
+ quad = statement.to_quad.map do |t|
165
+ case t
166
+ when node then RDF::Node("a")
167
+ when RDF::Node then RDF::Node("z")
168
+ else t
169
+ end
170
+ end
171
+ RDF::Statement.from(quad).to_nquads
172
+ end
173
+ log_debug("log point", "Hash First Degree Quads function (4.7.3).")
174
+ log_debug("nquads:")
175
+ nquads.each do |q|
176
+ log_debug {" - #{q.strip}"}
177
+ end
178
+
179
+ result = hexdigest(nquads.sort.join)
180
+ log_debug("hash") {result}
181
+ result
182
+ end
183
+
184
+ # @param [RDF::Node] related
185
+ # @param [RDF::Statement] statement
186
+ # @param [IdentifierIssuer] issuer
187
+ # @param [Symbol] position one of :s, :o, or :g
188
+ # @return [String] the SHA256 hexdigest hash
189
+ def hash_related_node(related, statement, issuer, position)
190
+ log_debug("related") {related.id}
191
+ input = "#{position}"
192
+ input << statement.predicate.to_ntriples unless position == :g
193
+ if identifier = (canonical_issuer.identifier(related) ||
194
+ issuer.identifier(related))
195
+ input << "_:#{identifier}"
196
+ else
197
+ log_debug("h1dq:")
198
+ input << log_depth(depth: 2) do
199
+ hash_first_degree_quads(related)
200
+ end
201
+ end
202
+ log_debug("input") {input.inspect}
203
+ log_debug("hash") {hexdigest(input)}
204
+ hexdigest(input)
205
+ end
206
+
207
+ # @param [RDF::Node] identifier
208
+ # @param [IdentifierIssuer] issuer
209
+ # @return [Array(String, IdentifierIssuer)] the hash and the issuer, in that order
210
+ def hash_n_degree_quads(identifier, issuer)
211
+ log_debug("hndq:")
212
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
213
+ log_debug(" identifier") {identifier.id}
214
+ log_debug(" issuer") {issuer.inspect}
215
+
216
+ # hash to related blank nodes map
217
+ hn = {}
218
+
219
+ log_debug(" hndq.2:")
220
+ log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
221
+ log_debug(" quads:")
222
+ bnode_to_statements[identifier].each do |s|
223
+ log_debug {" - #{s.to_nquads.strip}"}
224
+ end
225
+
226
+ # Step 3
227
+ log_debug(" hndq.3:")
228
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
229
+ log_debug(" with:") unless bnode_to_statements[identifier].empty?
230
+ bnode_to_statements[identifier].each do |statement|
231
+ log_debug {" - quad: #{statement.to_nquads.strip}"}
232
+ log_debug(" hndq.3.1:")
233
+ log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
234
+ log_depth(depth: 10) {hash_related_statement(identifier, statement, issuer, hn)}
235
+ end
236
+ log_debug(" Hash to bnodes:")
237
+ hn.each do |k,v|
238
+ log_debug(" #{k}:")
239
+ v.each do |vv|
240
+ log_debug(" - #{vv.id}")
241
+ end
242
+ end
243
+
244
+ data_to_hash = ""
245
+
246
+ # Step 5
247
+ log_debug(" hndq.5:")
248
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5)), entering loop.")
249
+ log_debug(" with:")
250
+ hn.keys.sort.each do |hash|
251
+ log_debug(" - related hash", hash)
252
+ log_debug(" data to hash") {data_to_hash.to_json}
253
+ list = hn[hash]
254
+ # Iterate over related nodes
255
+ chosen_path, chosen_issuer = "", nil
256
+ data_to_hash += hash
257
+
258
+ log_debug(" hndq.5.4:")
259
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4)), entering loop.")
260
+ log_debug(" with:") unless list.empty?
261
+ list.permutation do |permutation|
262
+ log_debug(" - perm") {permutation.map(&:id).to_json(indent: ' ', space: ' ')}
263
+ issuer_copy, path, recursion_list = issuer.dup, "", []
264
+
265
+ log_debug(" hndq.5.4.4:")
266
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4.4)), entering loop.")
267
+ log_debug(" with:")
268
+ permutation.each do |related|
269
+ log_debug(" - related") {related.id}
270
+ log_debug(" path") {path.to_json}
271
+ if canonical_issuer.identifier(related)
272
+ path << '_:' + canonical_issuer.issue_identifier(related)
273
+ else
274
+ recursion_list << related if !issuer_copy.identifier(related)
275
+ path << '_:' + issuer_copy.issue_identifier(related)
276
+ end
277
+
278
+ # Stop adding related nodes for this permutation once a chosen path exists and the current path is at least as long as the chosen path
279
+ break if !chosen_path.empty? && path.length >= chosen_path.length
280
+ end
281
+
282
+ log_debug(" hndq.5.4.5:")
283
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4.5)), before possible recursion.")
284
+ log_debug(" recursion list") {recursion_list.map(&:id).to_json(indent: ' ')}
285
+ log_debug(" path") {path.to_json}
286
+ log_debug(" with:") unless recursion_list.empty?
287
+ recursion_list.each do |related|
288
+ log_debug(" - related") {related.id}
289
+ result = log_depth(depth: 18) {hash_n_degree_quads(related, issuer_copy)}
290
+ path << '_:' + issuer_copy.issue_identifier(related)
291
+ path << "<#{result.first}>"
292
+ issuer_copy = result.last
293
+ log_debug(" hndq.5.4.5.4:")
294
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.4.5.4)), combine result of recursion.")
295
+ log_debug(" path") {path.to_json}
296
+ log_debug(" issuer copy") {issuer_copy.inspect}
297
+ break if !chosen_path.empty? && path.length >= chosen_path.length && path > chosen_path
298
+ end
299
+
300
+ if chosen_path.empty? || path < chosen_path
301
+ chosen_path, chosen_issuer = path, issuer_copy
302
+ end
303
+ end
304
+
305
+ data_to_hash += chosen_path
306
+ log_debug(" hndq.5.5:")
307
+ log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (5.5). End of current loop with Hn hashes.")
308
+ log_debug(" chosen path") {chosen_path.to_json}
309
+ log_debug(" data to hash") {data_to_hash.to_json}
310
+ issuer = chosen_issuer
311
+ end
312
+
313
+ log_debug(" hndq.6:")
314
+ log_debug(" log point", "Leaving Hash N-Degree Quads function (4.9.3).")
315
+ log_debug(" hash") {hexdigest(data_to_hash)}
316
+ log_depth(depth: 4) {log_debug("issuer") {issuer.inspect}}
317
+ return [hexdigest(data_to_hash), issuer]
318
+ end
319
+
320
+ def inspect
321
+ "NormalizationState:\nbnode_to_statements: #{inspect_bnode_to_statements}\nhash_to_bnodes: #{inspect_hash_to_bnodes}\ncanonical_issuer: #{canonical_issuer.inspect}"
322
+ end
323
+
324
+ def inspect_bnode_to_statements
325
+ bnode_to_statements.map do |n, statements|
326
+ "#{n.id}: #{statements.map {|s| s.to_nquads.strip}}"
327
+ end.join(", ")
328
+ end
329
+
330
+ def inspect_hash_to_bnodes
331
+ end
332
+
333
+ protected
334
+
335
+ def hexdigest(val)
336
+ Digest::SHA256.hexdigest(val)
337
+ end
338
+
339
+ # Group adjacent bnodes by hash
340
+ def hash_related_statement(identifier, statement, issuer, map)
341
+ log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
342
+ statement.to_h(:s, :p, :o, :g).each do |pos, term|
343
+ next if !term.is_a?(RDF::Node) || term == identifier
344
+
345
+ log_debug(" - position", pos)
346
+ hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
347
+ map[hash] ||= []
348
+ map[hash] << term unless map[hash].any? {|n| n.eql?(term)}
349
+ end
350
+ end
351
+ end
352
+
353
+ class IdentifierIssuer
354
+ def initialize(prefix = "c14n")
355
+ @prefix, @counter, @issued = prefix, 0, {}
356
+ end
357
+
358
+ # Return an identifier for this BNode
359
+ # @param [RDF::Node] node
360
+ # @return [String] Canonical identifier for node
361
+ def issue_identifier(node)
362
+ @issued[node] ||= begin
363
+ res, @counter = @prefix + @counter.to_s, @counter + 1
364
+ res
365
+ end
366
+ end
367
+
368
+ def issued
369
+ @issued.keys
370
+ end
371
+
372
+ # @return [String, nil] Canonical identifier assigned to node, or nil if none has been issued
373
+ def identifier(node)
374
+ @issued[node]
375
+ end
376
+
377
+ # Duplicate this issuer, ensuring that the issued identifiers remain distinct
378
+ # @return [IdentifierIssuer]
379
+ def dup
380
+ other = super
381
+ other.instance_variable_set(:@issued, @issued.dup)
382
+ other
383
+ end
384
+
385
+ def inspect
386
+ "{#{@issued.map {|k,v| "#{k.id}: #{v}"}.join(', ')}}"
387
+ end
388
+ end
389
+ end
390
+ end
@@ -1,12 +1,12 @@
1
1
  module RDF::Normalize
2
- class URGNA2012 < URDNA2015
2
+ class URGNA2012 < RDFC10
3
3
 
4
4
  def each(&block)
5
5
  ns = NormalizationState.new(@options)
6
6
  normalize_statements(ns, &block)
7
7
  end
8
8
 
9
- class NormalizationState < URDNA2015::NormalizationState
9
+ class NormalizationState < RDFC10::NormalizationState
10
10
  protected
11
11
 
12
12
  # 2012 version uses SHA-1
@@ -53,7 +53,7 @@ module RDF::Normalize
53
53
  #
54
54
  # @return [void]
55
55
  def write_epilogue
56
- statements = RDF::Normalize.new(@repo, **@options).
56
+ RDF::Normalize.new(@repo, **@options).
57
57
  statements.
58
58
  reject(&:variable?).
59
59
  map {|s| format_statement(s)}.
data/lib/rdf/normalize.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'rdf'
2
+ require 'digest'
2
3
 
3
4
  module RDF
4
5
  ##
@@ -31,7 +32,7 @@ module RDF
31
32
  autoload :Base, 'rdf/normalize/base'
32
33
  autoload :Carroll2001,'rdf/normalize/carroll2001'
33
34
  autoload :URGNA2012, 'rdf/normalize/urgna2012'
34
- autoload :URDNA2015, 'rdf/normalize/urdna2015'
35
+ autoload :RDFC10, 'rdf/normalize/rdfc10'
35
36
  autoload :VERSION, 'rdf/normalize/version'
36
37
  autoload :Writer, 'rdf/normalize/writer'
37
38
 
@@ -42,19 +43,19 @@ module RDF
42
43
  ALGORITHMS = {
43
44
  carroll2001: :Carroll2001,
44
45
  urgna2012: :URGNA2012,
45
- urdna2015: :URDNA2015
46
+ rdfc10: :RDFC10
46
47
  }.freeze
47
48
 
48
49
  ##
49
50
  # Creates a new normalizer instance using either the specified or default normalizer algorithm
50
51
  # @param [RDF::Enumerable] enumerable
51
52
  # @param [Hash{Symbol => Object}] options
52
- # @option options [Base] :algorithm (:urdna2015)
53
- # One of `:carroll2001`, `:urgna2012`, or `:urdna2015`
53
+ # @option options [Base] :algorithm (:rdfc10)
54
+ # One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
54
55
  # @return [RDF::Normalize::Base]
55
56
  # @raise [ArgumentError] selected algorithm not defined
56
57
  def new(enumerable, **options)
57
- algorithm = options.fetch(:algorithm, :urdna2015)
58
+ algorithm = options.fetch(:algorithm, :rdfc10)
58
59
  raise ArgumentError, "No algoritm defined for #{algorithm.to_sym}" unless ALGORITHMS.has_key?(algorithm)
59
60
  algorithm_class = const_get(ALGORITHMS[algorithm])
60
61
  algorithm_class.new(enumerable, **options)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdf-normalize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-27 00:00:00.000000000 Z
11
+ date: 2023-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdf
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '3.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rdf-trig
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.2'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.2'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: yard
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -94,7 +108,7 @@ files:
94
108
  - lib/rdf/normalize/base.rb
95
109
  - lib/rdf/normalize/carroll2001.rb
96
110
  - lib/rdf/normalize/format.rb
97
- - lib/rdf/normalize/urdna2015.rb
111
+ - lib/rdf/normalize/rdfc10.rb
98
112
  - lib/rdf/normalize/urgna2012.rb
99
113
  - lib/rdf/normalize/version.rb
100
114
  - lib/rdf/normalize/writer.rb
@@ -122,7 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
136
  - !ruby/object:Gem::Version
123
137
  version: '0'
124
138
  requirements: []
125
- rubygems_version: 3.3.7
139
+ rubygems_version: 3.4.13
126
140
  signing_key:
127
141
  specification_version: 4
128
142
  summary: RDF Graph normalizer for Ruby.
@@ -1,257 +0,0 @@
1
- module RDF::Normalize
2
- class URDNA2015
3
- include RDF::Enumerable
4
- include RDF::Util::Logger
5
- include Base
6
-
7
- ##
8
- # Create an enumerable with grounded nodes
9
- #
10
- # @param [RDF::Enumerable] enumerable
11
- # @return [RDF::Enumerable]
12
- def initialize(enumerable, **options)
13
- @dataset, @options = enumerable, options
14
- end
15
-
16
- def each(&block)
17
- ns = NormalizationState.new(@options)
18
- normalize_statements(ns, &block)
19
- end
20
-
21
- protected
22
- def normalize_statements(ns, &block)
23
- # Map BNodes to the statements they are used by
24
- dataset.each_statement do |statement|
25
- statement.to_quad.compact.select(&:node?).each do |node|
26
- ns.add_statement(node, statement)
27
- end
28
- end
29
-
30
- ns.hash_to_bnodes = {}
31
-
32
- # Calculate hashes for first degree nodes
33
- ns.bnode_to_statements.each_key do |node|
34
- hash = log_depth {ns.hash_first_degree_quads(node)}
35
- log_debug("1deg") {"hash: #{hash}"}
36
- ns.add_bnode_hash(node, hash)
37
- end
38
-
39
- # Create canonical replacements for hashes mapping to a single node
40
- ns.hash_to_bnodes.keys.sort.each do |hash|
41
- identifier_list = ns.hash_to_bnodes[hash]
42
- next if identifier_list.length > 1
43
- node = identifier_list.first
44
- id = ns.canonical_issuer.issue_identifier(node)
45
- log_debug("single node") {"node: #{node.to_ntriples}, hash: #{hash}, id: #{id}"}
46
- ns.hash_to_bnodes.delete(hash)
47
- end
48
-
49
- # Iterate over hashs having more than one node
50
- ns.hash_to_bnodes.keys.sort.each do |hash|
51
- identifier_list = ns.hash_to_bnodes[hash]
52
-
53
- log_debug("multiple nodes") {"node: #{identifier_list.map(&:to_ntriples).join(",")}, hash: #{hash}"}
54
- hash_path_list = []
55
-
56
- # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
57
- identifier_list.each do |identifier|
58
- next if ns.canonical_issuer.issued.include?(identifier)
59
- temporary_issuer = IdentifierIssuer.new("_:b")
60
- temporary_issuer.issue_identifier(identifier)
61
- hash_path_list << log_depth {ns.hash_n_degree_quads(identifier, temporary_issuer)}
62
- end
63
- log_debug("->") {"hash_path_list: #{hash_path_list.map(&:first).inspect}"}
64
-
65
- # Create canonical replacements for nodes
66
- hash_path_list.sort_by(&:first).map(&:last).each do |issuer|
67
- issuer.issued.each do |node|
68
- id = ns.canonical_issuer.issue_identifier(node)
69
- log_debug("-->") {"node: #{node.to_ntriples}, id: #{id}"}
70
- end
71
- end
72
- end
73
-
74
- # Yield statements using BNodes from canonical replacements
75
- dataset.each_statement do |statement|
76
- if statement.has_blank_nodes?
77
- quad = statement.to_quad.compact.map do |term|
78
- term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)[2..-1]) : term
79
- end
80
- block.call RDF::Statement.from(quad)
81
- else
82
- block.call statement
83
- end
84
- end
85
- end
86
-
87
- private
88
-
89
- class NormalizationState
90
- include RDF::Util::Logger
91
-
92
- attr_accessor :bnode_to_statements
93
- attr_accessor :hash_to_bnodes
94
- attr_accessor :canonical_issuer
95
-
96
- def initialize(options)
97
- @options = options
98
- @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("_:c14n")
99
- end
100
-
101
- def add_statement(node, statement)
102
- bnode_to_statements[node] ||= []
103
- bnode_to_statements[node] << statement unless bnode_to_statements[node].any? {|st| st.eql?(statement)}
104
- end
105
-
106
- def add_bnode_hash(node, hash)
107
- hash_to_bnodes[hash] ||= []
108
- # Match on object IDs of nodes, rather than simple node equality
109
- hash_to_bnodes[hash] << node unless hash_to_bnodes[hash].any? {|n| n.eql?(node)}
110
- end
111
-
112
- # @param [RDF::Node] node
113
- # @return [String] the SHA256 hexdigest hash of statements using this node, with replacements
114
- def hash_first_degree_quads(node)
115
- quads = bnode_to_statements[node].
116
- map do |statement|
117
- quad = statement.to_quad.map do |t|
118
- case t
119
- when node then RDF::Node("a")
120
- when RDF::Node then RDF::Node("z")
121
- else t
122
- end
123
- end
124
- RDF::NQuads::Writer.serialize(RDF::Statement.from(quad))
125
- end
126
-
127
- log_debug("1deg") {"node: #{node}, quads: #{quads}"}
128
- hexdigest(quads.sort.join)
129
- end
130
-
131
- # @param [RDF::Node] related
132
- # @param [RDF::Statement] statement
133
- # @param [IdentifierIssuer] issuer
134
- # @param [String] position one of :s, :o, or :g
135
- # @return [String] the SHA256 hexdigest hash
136
- def hash_related_node(related, statement, issuer, position)
137
- identifier = canonical_issuer.identifier(related) ||
138
- issuer.identifier(related) ||
139
- hash_first_degree_quads(related)
140
- input = "#{position}"
141
- input << statement.predicate.to_ntriples unless position == :g
142
- input << identifier
143
- log_debug("hrel") {"input: #{input.inspect}, hash: #{hexdigest(input)}"}
144
- hexdigest(input)
145
- end
146
-
147
- # @param [RDF::Node] identifier
148
- # @param [IdentifierIssuer] issuer
149
- # @return [Array<String,IdentifierIssuer>] the Hash and issuer
150
- def hash_n_degree_quads(identifier, issuer)
151
- log_debug("ndeg") {"identifier: #{identifier.to_ntriples}"}
152
-
153
- # hash to related blank nodes map
154
- map = {}
155
-
156
- bnode_to_statements[identifier].each do |statement|
157
- hash_related_statement(identifier, statement, issuer, map)
158
- end
159
-
160
- data_to_hash = ""
161
-
162
- log_debug("ndeg") {"map: #{map.map {|h,l| "#{h}: #{l.map(&:to_ntriples)}"}.join('; ')}"}
163
- log_depth do
164
- map.keys.sort.each do |hash|
165
- list = map[hash]
166
- # Iterate over related nodes
167
- chosen_path, chosen_issuer = "", nil
168
- data_to_hash += hash
169
-
170
- list.permutation do |permutation|
171
- log_debug("ndeg") {"perm: #{permutation.map(&:to_ntriples).join(",")}"}
172
- issuer_copy, path, recursion_list = issuer.dup, "", []
173
-
174
- permutation.each do |related|
175
- if canonical_issuer.identifier(related)
176
- path << canonical_issuer.issue_identifier(related)
177
- else
178
- recursion_list << related if !issuer_copy.identifier(related)
179
- path << issuer_copy.issue_identifier(related)
180
- end
181
-
182
- # Skip to the next permutation if chosen path isn't empty and the path is greater than the chosen path
183
- break if !chosen_path.empty? && path.length >= chosen_path.length
184
- end
185
- log_debug("ndeg") {"hash: #{hash}, path: #{path}, recursion: #{recursion_list.map(&:to_ntriples)}"}
186
-
187
- recursion_list.each do |related|
188
- result = log_depth {hash_n_degree_quads(related, issuer_copy)}
189
- path << issuer_copy.issue_identifier(related)
190
- path << "<#{result.first}>"
191
- issuer_copy = result.last
192
- break if !chosen_path.empty? && path.length >= chosen_path.length && path > chosen_path
193
- end
194
-
195
- if chosen_path.empty? || path < chosen_path
196
- chosen_path, chosen_issuer = path, issuer_copy
197
- end
198
- end
199
-
200
- data_to_hash += chosen_path
201
- issuer = chosen_issuer
202
- end
203
- end
204
-
205
- log_debug("ndeg") {"datatohash: #{data_to_hash.inspect}, hash: #{hexdigest(data_to_hash)}"}
206
- return [hexdigest(data_to_hash), issuer]
207
- end
208
-
209
- protected
210
-
211
- def hexdigest(val)
212
- Digest::SHA256.hexdigest(val)
213
- end
214
-
215
- # Group adjacent bnodes by hash
216
- def hash_related_statement(identifier, statement, issuer, map)
217
- statement.to_h(:s, :p, :o, :g).each do |pos, term|
218
- next if !term.is_a?(RDF::Node) || term == identifier
219
-
220
- hash = log_depth {hash_related_node(term, statement, issuer, pos)}
221
- map[hash] ||= []
222
- map[hash] << term unless map[hash].any? {|n| n.eql?(term)}
223
- end
224
- end
225
- end
226
-
227
- class IdentifierIssuer
228
- def initialize(prefix = "_:c14n")
229
- @prefix, @counter, @issued = prefix, 0, {}
230
- end
231
-
232
- # Return an identifier for this BNode
233
- def issue_identifier(node)
234
- @issued[node] ||= begin
235
- res, @counter = @prefix + @counter.to_s, @counter + 1
236
- res
237
- end
238
- end
239
-
240
- def issued
241
- @issued.keys
242
- end
243
-
244
- def identifier(node)
245
- @issued[node]
246
- end
247
-
248
- # Duplicate this issuer, ensuring that the issued identifiers remain distinct
249
- # @return [IdentifierIssuer]
250
- def dup
251
- other = super
252
- other.instance_variable_set(:@issued, @issued.dup)
253
- other
254
- end
255
- end
256
- end
257
- end