rdf-normalize 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 15478756de443574bde6120436faf09bec1f7e40dcfc60f39fc97af92e686738
4
- data.tar.gz: d5617da52a4d7e3429452f691e4a9ccb7f6ac8bedcef6dd66583b1322e0b57f0
3
+ metadata.gz: 150cdddab40f368e1d1e68ebc65efe3990032729e8d9a591ef8436d61e81d057
4
+ data.tar.gz: 4510812f3e52b0159ec2025421d116d5c98d37840f3f87e25affb8392a5aa8b0
5
5
  SHA512:
6
- metadata.gz: 7c2ccd4449f12d5095702d19a8c1d27539aa5afa23c8b96ffcf6f43ee0d6d10fd763e2dbc98f2ef008ede3edc3fda1801eb6a1cd3ad0e80e3b82995017ae93e4
7
- data.tar.gz: f760c7336703292679c82b6abbea86ffe7b8ac1b803508c187d8aee7bcd8cd635d0b039d928b7d145198f7df884027aeb911fa2e97e0e9d171cae92e4d26ed0b
6
+ metadata.gz: ff3fd846a595da0df711bd83673498d259260a19f42f59a9d5f10e55a70670de7c0fba4b301d63ea725e2484179fd6b91b462ea943df9afe8c3d937660d06327
7
+ data.tar.gz: c66aa1ec1740e1d0c894d5ed9104b61819d055fc83aa2c6674fa1c34d2c9e62c395a78bb6aeec2b46c68c063fc6e79d55250e8d590a0f97c0624888488c4d60c
data/README.md CHANGED
@@ -22,7 +22,7 @@ Algorithms implemented:
22
22
  Install with `gem install rdf-normalize`
23
23
 
24
24
  * 100% free and unencumbered [public domain](https://unlicense.org/) software.
25
- * Compatible with Ruby >= 2.6.
25
+ * Compatible with Ruby >= 3.0.
26
26
 
27
27
  ## Usage
28
28
 
@@ -37,7 +37,14 @@ Full documentation available on [GitHub][Normalize doc]
37
37
  require 'rdf/normalize'
38
38
  require 'rdf/turtle'
39
39
  g = RDF::Graph.load("etc/doap.ttl")
40
- puts g.dump(:normalize)
40
+ puts g.dump(:normalize) # Can also use :canonicalize
41
+
42
+ ### Normalizing an abstract Graph/Dataset
43
+ require 'rdf/normalize'
44
+ require 'rdf/turtle'
45
+ g = RDF::Graph.load("etc/doap.ttl")
46
+ g_canon = g.canonicalize # graph with URIs, literals, and blank nodes canonicalized.
47
+ puts g_canon.dump(:nquads) # Normalized, but not sorted
41
48
 
42
49
  ### Principal Classes
43
50
  * {RDF::Normalize}
@@ -46,11 +53,13 @@ Full documentation available on [GitHub][Normalize doc]
46
53
  * {RDF::Normalize::Writer}
47
54
  * {RDF::Normalize::URGNA2012}
48
55
  * {RDF::Normalize::RDFC10}
56
+ * {RDF::Canonicalize} – extends {RDF::Normalize}
57
+ * {RDF::Canonicalize::Format}
49
58
 
50
59
  ## Dependencies
51
60
 
52
- * [Ruby](https://ruby-lang.org/) (>= 2.6)
53
- * [RDF.rb](https://rubygems.org/gems/rdf) (~> 3.2)
61
+ * [Ruby](https://ruby-lang.org/) (>= 3.0)
62
+ * [RDF.rb](https://rubygems.org/gems/rdf) (~> 3.3)
54
63
 
55
64
  ## Installation
56
65
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.7.0
@@ -1,7 +1,7 @@
1
1
  module RDF::Normalize
2
2
  ##
3
3
  # Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
4
- module Base
4
+ class Base
5
5
  attr_reader :dataset
6
6
 
7
7
  # Enumerates normalized statements
@@ -11,5 +11,12 @@ module RDF::Normalize
11
11
  def each(&block)
12
12
  raise "Not Implemented"
13
13
  end
14
+
15
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
16
+ #
17
+ # @return [Hash{String => String}]
18
+ def to_hash
19
+ raise "Not Implemented"
20
+ end
14
21
  end
15
22
  end
@@ -1,8 +1,6 @@
1
1
  module RDF::Normalize
2
- class Carroll2001
2
+ class Carroll2001 < Base
3
3
  include RDF::Enumerable
4
- include Base
5
- include Utils
6
4
 
7
5
  ##
8
6
  # Create an enumerable with grounded nodes
@@ -2,7 +2,18 @@ require 'rdf/nquads'
2
2
 
3
3
  module RDF::Normalize
4
4
  class Format < RDF::Format
5
- content_type 'application/normalized+n-quads', alias: 'application/x-normalized+n-quads'
5
+ content_type 'application/canonical+n-quads', alias: 'application/x-canonical+n-quads'
6
+ content_encoding 'utf-8'
7
+
8
+ # It reads like normal N-Quads
9
+ reader { RDF::NQuads::Reader}
10
+ writer { RDF::Normalize::Writer }
11
+ end
12
+ end
13
+
14
+ module RDF::Canonicalize
15
+ class Format < RDF::Format
16
+ content_type 'application/canonical+n-quads', alias: 'application/x-canonical+n-quads'
6
17
  content_encoding 'utf-8'
7
18
 
8
19
  # It reads like normal N-Quads
@@ -6,27 +6,49 @@ rescue LoadError
6
6
  end
7
7
 
8
8
  module RDF::Normalize
9
- class RDFC10
9
+ class RDFC10 < Base
10
10
  include RDF::Enumerable
11
11
  include RDF::Util::Logger
12
- include Base
13
12
 
14
13
  ##
15
14
  # Create an enumerable with grounded nodes
16
15
  #
17
16
  # @param [RDF::Enumerable] enumerable
17
+ # @option options [Integer] :max_calls (40)
18
+ # Maximum number of calls allowed for recursive blank node labeling,
19
+ # as a multiple of the total number of blank nodes in the dataset.
20
+ # @option options [:MD5, :SHA1, :SHA2, :SHA256, :SHA384, :SHA512] :hash_algorithm (:SHA256)
21
+ # See [Digest Algorithms](https://github.com/ruby/digest#digest-algorithms)
18
22
  # @return [RDF::Enumerable]
23
+ # @raise [RuntimeError] if the maximum number of levels of recursion is exceeded.
19
24
  def initialize(enumerable, **options)
20
25
  @dataset, @options = enumerable, options
26
+ @options[:hash_algorithm] ||= :SHA256
27
+ unless %i{MD5 SHA1 SHA2 SHA256 SHA384 SHA512}.include?(@options[:hash_algorithm])
28
+ raise UnknownHashAlgorithm, "UnknownHashAlgorithm: #{@options[:hash_algorithm].inspect}. Use one of MD5, SHA1, SHA2, SHA256, SHA384, or SHA512"
29
+ end
21
30
  end
22
31
 
32
+ # Yields each normalized statement
23
33
  def each(&block)
24
- ns = NormalizationState.new(@options)
34
+ ns = NormalizationState.new(**@options)
25
35
  log_debug("ca:")
26
36
  log_debug(" log point", "Entering the canonicalization function (4.5.3).")
27
37
  log_depth(depth: 2) {normalize_statements(ns, &block)}
28
38
  end
29
39
 
40
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
41
+ #
42
+ # @return [Hash{String => String}]
43
+ def to_hash
44
+ ns = NormalizationState.new(**@options)
45
+ log_debug("ca:")
46
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
47
+ log_depth(depth: 2) {normalize_statements(ns)}
48
+ ns.canonical_issuer.to_hash
49
+ end
50
+
51
+ #
30
52
  protected
31
53
  def normalize_statements(ns, &block)
32
54
  # Step 2: Map BNodes to the statements they are used by
@@ -79,6 +101,11 @@ module RDF::Normalize
79
101
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
80
102
  log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
81
103
  log_debug(" with:") unless ns.hash_to_bnodes.empty?
104
+
105
+ # Initialize the number of calls allowed to hash_n_degree_quads
106
+ # as a multiple of the total number of blank nodes in the dataset.
107
+ ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
108
+
82
109
  ns.hash_to_bnodes.keys.sort.each do |hash|
83
110
  identifier_list = ns.hash_to_bnodes[hash]
84
111
 
@@ -105,27 +132,29 @@ module RDF::Normalize
105
132
  hash_path_list.sort_by(&:first).each do |result, issuer|
106
133
  issuer.issued.each do |node|
107
134
  id = ns.canonical_issuer.issue_identifier(node)
108
- log_debug(" - blank node") {node.id}
109
- log_debug(" canonical identifier", id)
135
+ log_debug(" - blank node") {node.id}
136
+ log_debug(" canonical identifier", id)
110
137
  end
111
138
  end
112
139
  end
113
140
 
114
141
  # Step 6: Yield statements using BNodes from canonical replacements
115
- dataset.each_statement do |statement|
116
- if statement.has_blank_nodes?
117
- quad = statement.to_quad.compact.map do |term|
118
- term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
142
+ if block_given?
143
+ dataset.each_statement do |statement|
144
+ if statement.has_blank_nodes?
145
+ quad = statement.to_quad.compact.map do |term|
146
+ term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
147
+ end
148
+ block.call RDF::Statement.from(quad)
149
+ else
150
+ block.call statement
119
151
  end
120
- block.call RDF::Statement.from(quad)
121
- else
122
- block.call statement
123
152
  end
124
153
  end
125
154
 
126
155
  log_debug("ca.6:")
127
- log_debug(" log point", "Replace original with canonical labels (4.5.3 (6)).")
128
- log_debug(" canonical issuer: #{ns.canonical_issuer.inspect}")
156
+ log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
157
+ log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
129
158
  dataset
130
159
  end
131
160
 
@@ -135,12 +164,17 @@ module RDF::Normalize
135
164
  include RDF::Util::Logger
136
165
 
137
166
  attr_accessor :bnode_to_statements
167
+ attr_accessor :hash_algorithm
138
168
  attr_accessor :hash_to_bnodes
139
169
  attr_accessor :canonical_issuer
170
+ attr_accessor :max_calls
171
+ attr_accessor :total_calls
140
172
 
141
- def initialize(options)
173
+ def initialize(**options)
142
174
  @options = options
175
+ @hash_algorithm = Digest.const_get(options.fetch(:hash_algorithm, :SHA256))
143
176
  @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
177
+ @max_calls, @total_calls = nil, 0
144
178
  end
145
179
 
146
180
  def add_statement(node, statement)
@@ -204,34 +238,40 @@ module RDF::Normalize
204
238
  hexdigest(input)
205
239
  end
206
240
 
207
- # @param [RDF::Node] identifier
241
+ # @param [RDF::Node] node
208
242
  # @param [IdentifierIssuer] issuer
209
243
  # @return [Array<String,IdentifierIssuer>] the Hash and issuer
210
- def hash_n_degree_quads(identifier, issuer)
244
+ # @raise [MaxCallsExceeded] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
245
+ def hash_n_degree_quads(node, issuer)
211
246
  log_debug("hndq:")
212
247
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
213
- log_debug(" identifier") {identifier.id}
248
+ log_debug(" identifier") {node.id}
214
249
  log_debug(" issuer") {issuer.inspect}
215
250
 
251
+ if max_calls && total_calls >= max_calls
252
+ raise MaxCallsExceeded, "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
253
+ end
254
+ @total_calls += 1
255
+
216
256
  # hash to related blank nodes map
217
257
  hn = {}
218
258
 
219
259
  log_debug(" hndq.2:")
220
260
  log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
221
261
  log_debug(" quads:")
222
- bnode_to_statements[identifier].each do |s|
262
+ bnode_to_statements[node].each do |s|
223
263
  log_debug {" - #{s.to_nquads.strip}"}
224
264
  end
225
265
 
226
266
  # Step 3
227
267
  log_debug(" hndq.3:")
228
268
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
229
- log_debug(" with:") unless bnode_to_statements[identifier].empty?
230
- bnode_to_statements[identifier].each do |statement|
269
+ log_debug(" with:") unless bnode_to_statements[node].empty?
270
+ bnode_to_statements[node].each do |statement|
231
271
  log_debug {" - quad: #{statement.to_nquads.strip}"}
232
272
  log_debug(" hndq.3.1:")
233
273
  log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
234
- log_depth(depth: 10) {hash_related_statement(identifier, statement, issuer, hn)}
274
+ log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
235
275
  end
236
276
  log_debug(" Hash to bnodes:")
237
277
  hn.each do |k,v|
@@ -286,7 +326,9 @@ module RDF::Normalize
286
326
  log_debug(" with:") unless recursion_list.empty?
287
327
  recursion_list.each do |related|
288
328
  log_debug(" - related") {related.id}
289
- result = log_depth(depth: 18) {hash_n_degree_quads(related, issuer_copy)}
329
+ result = log_depth(depth: 18) do
330
+ hash_n_degree_quads(related, issuer_copy)
331
+ end
290
332
  path << '_:' + issuer_copy.issue_identifier(related)
291
333
  path << "<#{result.first}>"
292
334
  issuer_copy = result.last
@@ -333,14 +375,14 @@ module RDF::Normalize
333
375
  protected
334
376
 
335
377
  def hexdigest(val)
336
- Digest::SHA256.hexdigest(val)
378
+ hash_algorithm.hexdigest(val)
337
379
  end
338
380
 
339
381
  # Group adjacent bnodes by hash
340
- def hash_related_statement(identifier, statement, issuer, map)
382
+ def hash_related_statement(node, statement, issuer, map)
341
383
  log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
342
384
  statement.to_h(:s, :p, :o, :g).each do |pos, term|
343
- next if !term.is_a?(RDF::Node) || term == identifier
385
+ next if !term.is_a?(RDF::Node) || term == node
344
386
 
345
387
  log_debug(" - position", pos)
346
388
  hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
@@ -374,6 +416,11 @@ module RDF::Normalize
374
416
  @issued[node]
375
417
  end
376
418
 
419
+ # @return [Hash{String => String}] the issued identifiers map
420
+ def to_hash
421
+ @issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
422
+ end
423
+
377
424
  # Duplicate this issuer, ensuring that the issued identifiers remain distinct
378
425
  # @return [IdentifierIssuer]
379
426
  def dup
@@ -2,10 +2,16 @@ module RDF::Normalize
2
2
  class URGNA2012 < RDFC10
3
3
 
4
4
  def each(&block)
5
- ns = NormalizationState.new(@options)
5
+ ns = NormalizationState.new(**@options)
6
6
  normalize_statements(ns, &block)
7
7
  end
8
8
 
9
+ def to_hash
10
+ ns = NormalizationState.new(**@options)
11
+ normalize_statements(ns)
12
+ ns.canonical_issuer.to_h
13
+ end
14
+
9
15
  class NormalizationState < RDFC10::NormalizationState
10
16
  protected
11
17
 
data/lib/rdf/normalize.rb CHANGED
@@ -3,7 +3,7 @@ require 'digest'
3
3
 
4
4
  module RDF
5
5
  ##
6
- # **`RDF::Normalize`** is an RDF Graph normalization plugin for RDF.rb.
6
+ # **`RDF::Normalize`** is an RDF Graph canonicalization plugin for RDF.rb.
7
7
  #
8
8
  # @example Requiring the `RDF::Normalize` module
9
9
  # require 'rdf/normalize'
@@ -18,7 +18,7 @@ module RDF
18
18
  # @example Returning normalized N-Quads
19
19
  #
20
20
  # g = RDF::Graph.load("etc/doap.ttl")
21
- # g.dump(:normalize)
21
+ # g.dump(:normalize) # or :canonicalize
22
22
  #
23
23
  # @example Writing a repository as normalized N-Quads
24
24
  #
@@ -52,6 +52,10 @@ module RDF
52
52
  # @param [Hash{Symbol => Object}] options
53
53
  # @option options [Base] :algorithm (:rdfc10)
54
54
  # One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
55
+ # @option options [Integer] :max_calls
56
+ # Maximum number of calls allowed for recursive blank node labeling,
57
+ # as a multiple of the total number of blank nodes in the dataset.
58
+ # @option options [Boolean] :identifier_map
55
59
  # @return [RDF::Normalize::Base]
56
60
  # @raise [ArgumentError] selected algorithm not defined
57
61
  def new(enumerable, **options)
@@ -62,5 +66,30 @@ module RDF
62
66
  end
63
67
  module_function :new
64
68
 
69
+ class MaxCallsExceeded < RuntimeError; end
70
+ class UnknownHashAlgorithm < RuntimeError; end
71
+ end
72
+
73
+ module Canonicalize
74
+ # RDF::Canonicalize extends RDF::Normalize.
75
+ include Normalize
76
+ end
77
+
78
+ # Change RDF::Enumerable#canonicalize
79
+ module Enumerable
80
+ ##
81
+ # Returns the Enumerable resulting from RDF::Normalize.
82
+ # This also canonicalizes URIs and Literals.
83
+ #
84
+ # @return [RDF::Enumerable]
85
+ remove_method :canonicalize if method_defined? :canonicalize
86
+ def canonicalize
87
+ # Ensure that statements are queryable, countable and enumerable
88
+ this = self
89
+ enum = Enumerator.new do |yielder|
90
+ this.send(:each_statement) {|y| yielder << y.canonicalize}
91
+ end
92
+ RDF::Normalize.new(enum)
93
+ end
65
94
  end
66
95
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdf-normalize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-10 00:00:00.000000000 Z
11
+ date: 2023-09-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdf
@@ -16,70 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '3.2'
19
+ version: '3.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '3.2'
26
+ version: '3.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rdf-spec
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '3.2'
33
+ version: '3.3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '3.2'
40
+ version: '3.3'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.10'
47
+ version: '3.12'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.10'
54
+ version: '3.12'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: json-ld
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.2'
61
+ version: '3.3'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.2'
68
+ version: '3.3'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rdf-trig
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '3.2'
75
+ version: '3.3'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '3.2'
82
+ version: '3.3'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: yard
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -129,14 +129,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
129
129
  requirements:
130
130
  - - ">="
131
131
  - !ruby/object:Gem::Version
132
- version: '2.6'
132
+ version: '3.0'
133
133
  required_rubygems_version: !ruby/object:Gem::Requirement
134
134
  requirements:
135
135
  - - ">="
136
136
  - !ruby/object:Gem::Version
137
137
  version: '0'
138
138
  requirements: []
139
- rubygems_version: 3.4.13
139
+ rubygems_version: 3.4.19
140
140
  signing_key:
141
141
  specification_version: 4
142
142
  summary: RDF Graph normalizer for Ruby.