rdf-normalize 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 15478756de443574bde6120436faf09bec1f7e40dcfc60f39fc97af92e686738
4
- data.tar.gz: d5617da52a4d7e3429452f691e4a9ccb7f6ac8bedcef6dd66583b1322e0b57f0
3
+ metadata.gz: 150cdddab40f368e1d1e68ebc65efe3990032729e8d9a591ef8436d61e81d057
4
+ data.tar.gz: 4510812f3e52b0159ec2025421d116d5c98d37840f3f87e25affb8392a5aa8b0
5
5
  SHA512:
6
- metadata.gz: 7c2ccd4449f12d5095702d19a8c1d27539aa5afa23c8b96ffcf6f43ee0d6d10fd763e2dbc98f2ef008ede3edc3fda1801eb6a1cd3ad0e80e3b82995017ae93e4
7
- data.tar.gz: f760c7336703292679c82b6abbea86ffe7b8ac1b803508c187d8aee7bcd8cd635d0b039d928b7d145198f7df884027aeb911fa2e97e0e9d171cae92e4d26ed0b
6
+ metadata.gz: ff3fd846a595da0df711bd83673498d259260a19f42f59a9d5f10e55a70670de7c0fba4b301d63ea725e2484179fd6b91b462ea943df9afe8c3d937660d06327
7
+ data.tar.gz: c66aa1ec1740e1d0c894d5ed9104b61819d055fc83aa2c6674fa1c34d2c9e62c395a78bb6aeec2b46c68c063fc6e79d55250e8d590a0f97c0624888488c4d60c
data/README.md CHANGED
@@ -22,7 +22,7 @@ Algorithms implemented:
22
22
  Install with `gem install rdf-normalize`
23
23
 
24
24
  * 100% free and unencumbered [public domain](https://unlicense.org/) software.
25
- * Compatible with Ruby >= 2.6.
25
+ * Compatible with Ruby >= 3.0.
26
26
 
27
27
  ## Usage
28
28
 
@@ -37,7 +37,14 @@ Full documentation available on [GitHub][Normalize doc]
37
37
  require 'rdf/normalize'
38
38
  require 'rdf/turtle'
39
39
  g = RDF::Graph.load("etc/doap.ttl")
40
- puts g.dump(:normalize)
40
+ puts g.dump(:normalize) # Can also use :canonicalize
41
+
42
+ ### Normalizing an abstract Graph/Dataset
43
+ require 'rdf/normalize'
44
+ require 'rdf/turtle'
45
+ g = RDF::Graph.load("etc/doap.ttl")
46
+ g_canon = g.canonicalize # graph with URIs, literals, and blank nodes canonicalized.
47
+ puts g_canon.dump(:nquads) # Normalized, but not sorted
41
48
 
42
49
  ### Principal Classes
43
50
  * {RDF::Normalize}
@@ -46,11 +53,13 @@ Full documentation available on [GitHub][Normalize doc]
46
53
  * {RDF::Normalize::Writer}
47
54
  * {RDF::Normalize::URGNA2012}
48
55
  * {RDF::Normalize::RDFC10}
56
+ * {RDF::Canonicalize} – extends {RDF::Normalize}
57
+ * {RDF::Canonicalize::Format}
49
58
 
50
59
  ## Dependencies
51
60
 
52
- * [Ruby](https://ruby-lang.org/) (>= 2.6)
53
- * [RDF.rb](https://rubygems.org/gems/rdf) (~> 3.2)
61
+ * [Ruby](https://ruby-lang.org/) (>= 3.0)
62
+ * [RDF.rb](https://rubygems.org/gems/rdf) (~> 3.3)
54
63
 
55
64
  ## Installation
56
65
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.7.0
@@ -1,7 +1,7 @@
1
1
  module RDF::Normalize
2
2
  ##
3
3
  # Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
4
- module Base
4
+ class Base
5
5
  attr_reader :dataset
6
6
 
7
7
  # Enumerates normalized statements
@@ -11,5 +11,12 @@ module RDF::Normalize
11
11
  def each(&block)
12
12
  raise "Not Implemented"
13
13
  end
14
+
15
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
16
+ #
17
+ # @return [Hash{String => String}]
18
+ def to_hash
19
+ raise "Not Implemented"
20
+ end
14
21
  end
15
22
  end
@@ -1,8 +1,6 @@
1
1
  module RDF::Normalize
2
- class Carroll2001
2
+ class Carroll2001 < Base
3
3
  include RDF::Enumerable
4
- include Base
5
- include Utils
6
4
 
7
5
  ##
8
6
  # Create an enumerable with grounded nodes
@@ -2,7 +2,18 @@ require 'rdf/nquads'
2
2
 
3
3
  module RDF::Normalize
4
4
  class Format < RDF::Format
5
- content_type 'application/normalized+n-quads', alias: 'application/x-normalized+n-quads'
5
+ content_type 'application/canonical+n-quads', alias: 'application/x-canonical+n-quads'
6
+ content_encoding 'utf-8'
7
+
8
+ # It reads like normal N-Quads
9
+ reader { RDF::NQuads::Reader}
10
+ writer { RDF::Normalize::Writer }
11
+ end
12
+ end
13
+
14
+ module RDF::Canonicalize
15
+ class Format < RDF::Format
16
+ content_type 'application/canonical+n-quads', alias: 'application/x-canonical+n-quads'
6
17
  content_encoding 'utf-8'
7
18
 
8
19
  # It reads like normal N-Quads
@@ -6,27 +6,49 @@ rescue LoadError
6
6
  end
7
7
 
8
8
  module RDF::Normalize
9
- class RDFC10
9
+ class RDFC10 < Base
10
10
  include RDF::Enumerable
11
11
  include RDF::Util::Logger
12
- include Base
13
12
 
14
13
  ##
15
14
  # Create an enumerable with grounded nodes
16
15
  #
17
16
  # @param [RDF::Enumerable] enumerable
17
+ # @option options [Integer] :max_calls (40)
18
+ # Maximum number of calls allowed for recursive blank node labeling,
19
+ # as a multiple of the total number of blank nodes in the dataset.
20
+ # @option options [:MD5, :SHA1, :SHA2, :SHA256, :SHA384, :SHA512] :hash_algorithm (:SHA256)
21
+ # See [Digest Algorithms](https://github.com/ruby/digest#digest-algorithms)
18
22
  # @return [RDF::Enumerable]
23
+ # @raise [RuntimeError] if the maximum number of levels of recursion is exceeded.
19
24
  def initialize(enumerable, **options)
20
25
  @dataset, @options = enumerable, options
26
+ @options[:hash_algorithm] ||= :SHA256
27
+ unless %i{MD5 SHA1 SHA2 SHA256 SHA384 SHA512}.include?(@options[:hash_algorithm])
28
+ raise UnknownHashAlgorithm, "UnknownHashAlgorithm: #{@options[:hash_algorithm].inspect}. Use one of MD5, SHA1, SHA2, SHA256, SHA384, or SHA512"
29
+ end
21
30
  end
22
31
 
32
+ # Yields each normalized statement
23
33
  def each(&block)
24
- ns = NormalizationState.new(@options)
34
+ ns = NormalizationState.new(**@options)
25
35
  log_debug("ca:")
26
36
  log_debug(" log point", "Entering the canonicalization function (4.5.3).")
27
37
  log_depth(depth: 2) {normalize_statements(ns, &block)}
28
38
  end
29
39
 
40
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
41
+ #
42
+ # @return [Hash{String => String}]
43
+ def to_hash
44
+ ns = NormalizationState.new(**@options)
45
+ log_debug("ca:")
46
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
47
+ log_depth(depth: 2) {normalize_statements(ns)}
48
+ ns.canonical_issuer.to_hash
49
+ end
50
+
51
+ #
30
52
  protected
31
53
  def normalize_statements(ns, &block)
32
54
  # Step 2: Map BNodes to the statements they are used by
@@ -79,6 +101,11 @@ module RDF::Normalize
79
101
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
80
102
  log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
81
103
  log_debug(" with:") unless ns.hash_to_bnodes.empty?
104
+
105
+ # Initialize the number of calls allowed to hash_n_degree_quads
106
+ # as a multiple of the total number of blank nodes in the dataset.
107
+ ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
108
+
82
109
  ns.hash_to_bnodes.keys.sort.each do |hash|
83
110
  identifier_list = ns.hash_to_bnodes[hash]
84
111
 
@@ -105,27 +132,29 @@ module RDF::Normalize
105
132
  hash_path_list.sort_by(&:first).each do |result, issuer|
106
133
  issuer.issued.each do |node|
107
134
  id = ns.canonical_issuer.issue_identifier(node)
108
- log_debug(" - blank node") {node.id}
109
- log_debug(" canonical identifier", id)
135
+ log_debug(" - blank node") {node.id}
136
+ log_debug(" canonical identifier", id)
110
137
  end
111
138
  end
112
139
  end
113
140
 
114
141
  # Step 6: Yield statements using BNodes from canonical replacements
115
- dataset.each_statement do |statement|
116
- if statement.has_blank_nodes?
117
- quad = statement.to_quad.compact.map do |term|
118
- term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
142
+ if block_given?
143
+ dataset.each_statement do |statement|
144
+ if statement.has_blank_nodes?
145
+ quad = statement.to_quad.compact.map do |term|
146
+ term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
147
+ end
148
+ block.call RDF::Statement.from(quad)
149
+ else
150
+ block.call statement
119
151
  end
120
- block.call RDF::Statement.from(quad)
121
- else
122
- block.call statement
123
152
  end
124
153
  end
125
154
 
126
155
  log_debug("ca.6:")
127
- log_debug(" log point", "Replace original with canonical labels (4.5.3 (6)).")
128
- log_debug(" canonical issuer: #{ns.canonical_issuer.inspect}")
156
+ log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
157
+ log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
129
158
  dataset
130
159
  end
131
160
 
@@ -135,12 +164,17 @@ module RDF::Normalize
135
164
  include RDF::Util::Logger
136
165
 
137
166
  attr_accessor :bnode_to_statements
167
+ attr_accessor :hash_algorithm
138
168
  attr_accessor :hash_to_bnodes
139
169
  attr_accessor :canonical_issuer
170
+ attr_accessor :max_calls
171
+ attr_accessor :total_calls
140
172
 
141
- def initialize(options)
173
+ def initialize(**options)
142
174
  @options = options
175
+ @hash_algorithm = Digest.const_get(options.fetch(:hash_algorithm, :SHA256))
143
176
  @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
177
+ @max_calls, @total_calls = nil, 0
144
178
  end
145
179
 
146
180
  def add_statement(node, statement)
@@ -204,34 +238,40 @@ module RDF::Normalize
204
238
  hexdigest(input)
205
239
  end
206
240
 
207
- # @param [RDF::Node] identifier
241
+ # @param [RDF::Node] node
208
242
  # @param [IdentifierIssuer] issuer
209
243
  # @return [Array<String,IdentifierIssuer>] the Hash and issuer
210
- def hash_n_degree_quads(identifier, issuer)
244
+ # @raise [MaxCallsExceeded] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
245
+ def hash_n_degree_quads(node, issuer)
211
246
  log_debug("hndq:")
212
247
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
213
- log_debug(" identifier") {identifier.id}
248
+ log_debug(" identifier") {node.id}
214
249
  log_debug(" issuer") {issuer.inspect}
215
250
 
251
+ if max_calls && total_calls >= max_calls
252
+ raise MaxCallsExceeded, "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
253
+ end
254
+ @total_calls += 1
255
+
216
256
  # hash to related blank nodes map
217
257
  hn = {}
218
258
 
219
259
  log_debug(" hndq.2:")
220
260
  log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
221
261
  log_debug(" quads:")
222
- bnode_to_statements[identifier].each do |s|
262
+ bnode_to_statements[node].each do |s|
223
263
  log_debug {" - #{s.to_nquads.strip}"}
224
264
  end
225
265
 
226
266
  # Step 3
227
267
  log_debug(" hndq.3:")
228
268
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
229
- log_debug(" with:") unless bnode_to_statements[identifier].empty?
230
- bnode_to_statements[identifier].each do |statement|
269
+ log_debug(" with:") unless bnode_to_statements[node].empty?
270
+ bnode_to_statements[node].each do |statement|
231
271
  log_debug {" - quad: #{statement.to_nquads.strip}"}
232
272
  log_debug(" hndq.3.1:")
233
273
  log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
234
- log_depth(depth: 10) {hash_related_statement(identifier, statement, issuer, hn)}
274
+ log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
235
275
  end
236
276
  log_debug(" Hash to bnodes:")
237
277
  hn.each do |k,v|
@@ -286,7 +326,9 @@ module RDF::Normalize
286
326
  log_debug(" with:") unless recursion_list.empty?
287
327
  recursion_list.each do |related|
288
328
  log_debug(" - related") {related.id}
289
- result = log_depth(depth: 18) {hash_n_degree_quads(related, issuer_copy)}
329
+ result = log_depth(depth: 18) do
330
+ hash_n_degree_quads(related, issuer_copy)
331
+ end
290
332
  path << '_:' + issuer_copy.issue_identifier(related)
291
333
  path << "<#{result.first}>"
292
334
  issuer_copy = result.last
@@ -333,14 +375,14 @@ module RDF::Normalize
333
375
  protected
334
376
 
335
377
  def hexdigest(val)
336
- Digest::SHA256.hexdigest(val)
378
+ hash_algorithm.hexdigest(val)
337
379
  end
338
380
 
339
381
  # Group adjacent bnodes by hash
340
- def hash_related_statement(identifier, statement, issuer, map)
382
+ def hash_related_statement(node, statement, issuer, map)
341
383
  log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
342
384
  statement.to_h(:s, :p, :o, :g).each do |pos, term|
343
- next if !term.is_a?(RDF::Node) || term == identifier
385
+ next if !term.is_a?(RDF::Node) || term == node
344
386
 
345
387
  log_debug(" - position", pos)
346
388
  hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
@@ -374,6 +416,11 @@ module RDF::Normalize
374
416
  @issued[node]
375
417
  end
376
418
 
419
+ # @return [Hash{String => String}] the issued identifiers map
420
+ def to_hash
421
+ @issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
422
+ end
423
+
377
424
  # Duplicate this issuer, ensuring that the issued identifiers remain distinct
378
425
  # @return [IdentifierIssuer]
379
426
  def dup
@@ -2,10 +2,16 @@ module RDF::Normalize
2
2
  class URGNA2012 < RDFC10
3
3
 
4
4
  def each(&block)
5
- ns = NormalizationState.new(@options)
5
+ ns = NormalizationState.new(**@options)
6
6
  normalize_statements(ns, &block)
7
7
  end
8
8
 
9
+ def to_hash
10
+ ns = NormalizationState.new(**@options)
11
+ normalize_statements(ns)
12
+ ns.canonical_issuer.to_h
13
+ end
14
+
9
15
  class NormalizationState < RDFC10::NormalizationState
10
16
  protected
11
17
 
data/lib/rdf/normalize.rb CHANGED
@@ -3,7 +3,7 @@ require 'digest'
3
3
 
4
4
  module RDF
5
5
  ##
6
- # **`RDF::Normalize`** is an RDF Graph normalization plugin for RDF.rb.
6
+ # **`RDF::Normalize`** is an RDF Graph canonicalization plugin for RDF.rb.
7
7
  #
8
8
  # @example Requiring the `RDF::Normalize` module
9
9
  # require 'rdf/normalize'
@@ -18,7 +18,7 @@ module RDF
18
18
  # @example Returning normalized N-Quads
19
19
  #
20
20
  # g = RDF::Graph.load("etc/doap.ttl")
21
- # g.dump(:normalize)
21
+ # g.dump(:normalize) # or :canonicalize
22
22
  #
23
23
  # @example Writing a repository as normalized N-Quads
24
24
  #
@@ -52,6 +52,10 @@ module RDF
52
52
  # @param [Hash{Symbol => Object}] options
53
53
  # @option options [Base] :algorithm (:rdfc10)
54
54
  # One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
55
+ # @option options [Integer] :max_calls
56
+ # Maximum number of calls allowed for recursive blank node labeling,
57
+ # as a multiple of the total number of blank nodes in the dataset.
58
+ # @option options [Boolean] :identifier_map
55
59
  # @return [RDF::Normalize::Base]
56
60
  # @raise [ArgumentError] selected algorithm not defined
57
61
  def new(enumerable, **options)
@@ -62,5 +66,30 @@ module RDF
62
66
  end
63
67
  module_function :new
64
68
 
69
+ class MaxCallsExceeded < RuntimeError; end
70
+ class UnknownHashAlgorithm < RuntimeError; end
71
+ end
72
+
73
+ module Canonicalize
74
+ # RDF::Canonicalize extends RDF::Normalize.
75
+ include Normalize
76
+ end
77
+
78
+ # Change RDF::Enumerable#canonicalize
79
+ module Enumerable
80
+ ##
81
+ # Returns the Enumerable produced by RDF::Normalize.
82
+ # This also canonicalizes URIs and Literals.
83
+ #
84
+ # @return [RDF::Enumerable]
85
+ remove_method :canonicalize if method_defined? :canonicalize
86
+ def canonicalize
87
+ # Ensure that statements are queryable, countable and enumerable
88
+ this = self
89
+ enum = Enumerator.new do |yielder|
90
+ this.send(:each_statement) {|y| yielder << y.canonicalize}
91
+ end
92
+ RDF::Normalize.new(enum)
93
+ end
65
94
  end
66
95
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdf-normalize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-10 00:00:00.000000000 Z
11
+ date: 2023-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdf
@@ -16,70 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '3.2'
19
+ version: '3.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '3.2'
26
+ version: '3.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rdf-spec
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '3.2'
33
+ version: '3.3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '3.2'
40
+ version: '3.3'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.10'
47
+ version: '3.12'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.10'
54
+ version: '3.12'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: json-ld
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.2'
61
+ version: '3.3'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.2'
68
+ version: '3.3'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rdf-trig
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '3.2'
75
+ version: '3.3'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '3.2'
82
+ version: '3.3'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: yard
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -129,14 +129,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
129
129
  requirements:
130
130
  - - ">="
131
131
  - !ruby/object:Gem::Version
132
- version: '2.6'
132
+ version: '3.0'
133
133
  required_rubygems_version: !ruby/object:Gem::Requirement
134
134
  requirements:
135
135
  - - ">="
136
136
  - !ruby/object:Gem::Version
137
137
  version: '0'
138
138
  requirements: []
139
- rubygems_version: 3.4.13
139
+ rubygems_version: 3.4.19
140
140
  signing_key:
141
141
  specification_version: 4
142
142
  summary: RDF Graph normalizer for Ruby.