rdf-normalize 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 15478756de443574bde6120436faf09bec1f7e40dcfc60f39fc97af92e686738
4
- data.tar.gz: d5617da52a4d7e3429452f691e4a9ccb7f6ac8bedcef6dd66583b1322e0b57f0
3
+ metadata.gz: 7b8d7e930eb7f452fef42bd6f66b29dfcd3f526f7c9dafebbfe214ed5dfa4007
4
+ data.tar.gz: c4c60292f9868d39d50545dca77c8d5afd85885b0a1fc690b0ce84d2ee00ddd7
5
5
  SHA512:
6
- metadata.gz: 7c2ccd4449f12d5095702d19a8c1d27539aa5afa23c8b96ffcf6f43ee0d6d10fd763e2dbc98f2ef008ede3edc3fda1801eb6a1cd3ad0e80e3b82995017ae93e4
7
- data.tar.gz: f760c7336703292679c82b6abbea86ffe7b8ac1b803508c187d8aee7bcd8cd635d0b039d928b7d145198f7df884027aeb911fa2e97e0e9d171cae92e4d26ed0b
6
+ metadata.gz: 05dd3390670479211a348c4fbd91e0c369355f374d72d2834697641ab64b3cce438b4df8ee63f6db50af1d68e036fa747762474448e6d326d5bfa758ca120b3d
7
+ data.tar.gz: 6189376e59d897e1e6d4b440b6ed75f0e7de970d6cbacb6b21e7e75976e7da0b6e6899085e2972eabf307556d9b17eb5277509fde3096e3ab428a0e84758d9c8
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.6.1
@@ -1,7 +1,7 @@
1
1
  module RDF::Normalize
2
2
  ##
3
3
  # Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
4
- module Base
4
+ class Base
5
5
  attr_reader :dataset
6
6
 
7
7
  # Enumerates normalized statements
@@ -11,5 +11,12 @@ module RDF::Normalize
11
11
  def each(&block)
12
12
  raise "Not Implemented"
13
13
  end
14
+
15
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
16
+ #
17
+ # @return [Hash{String => String}]
18
+ def to_hash
19
+ raise "Not Implemented"
20
+ end
14
21
  end
15
22
  end
@@ -1,8 +1,6 @@
1
1
  module RDF::Normalize
2
- class Carroll2001
2
+ class Carroll2001 < Base
3
3
  include RDF::Enumerable
4
- include Base
5
- include Utils
6
4
 
7
5
  ##
8
6
  # Create an enumerable with grounded nodes
@@ -6,27 +6,43 @@ rescue LoadError
6
6
  end
7
7
 
8
8
  module RDF::Normalize
9
- class RDFC10
9
+ class RDFC10 < Base
10
10
  include RDF::Enumerable
11
11
  include RDF::Util::Logger
12
- include Base
13
12
 
14
13
  ##
15
14
  # Create an enumerable with grounded nodes
16
15
  #
17
16
  # @param [RDF::Enumerable] enumerable
17
+ # @option options [Integer] :max_calls (40)
18
+ # Maximum number of calls allowed for recursive blank node labeling,
19
+ # as a multiple of the total number of blank nodes in the dataset.
18
20
  # @return [RDF::Enumerable]
21
+ # @raise [RuntimeError] if the maximum number of calls allowed for recursive blank node labeling is exceeded.
19
22
  def initialize(enumerable, **options)
20
23
  @dataset, @options = enumerable, options
21
24
  end
22
25
 
26
+ # Yields each normalized statement
23
27
  def each(&block)
24
- ns = NormalizationState.new(@options)
28
+ ns = NormalizationState.new(**@options)
25
29
  log_debug("ca:")
26
30
  log_debug(" log point", "Entering the canonicalization function (4.5.3).")
27
31
  log_depth(depth: 2) {normalize_statements(ns, &block)}
28
32
  end
29
33
 
34
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
35
+ #
36
+ # @return [Hash{String => String}]
37
+ def to_hash
38
+ ns = NormalizationState.new(**@options)
39
+ log_debug("ca:")
40
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
41
+ log_depth(depth: 2) {normalize_statements(ns)}
42
+ ns.canonical_issuer.to_hash
43
+ end
44
+
45
+ #
30
46
  protected
31
47
  def normalize_statements(ns, &block)
32
48
  # Step 2: Map BNodes to the statements they are used by
@@ -79,6 +95,11 @@ module RDF::Normalize
79
95
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
80
96
  log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
81
97
  log_debug(" with:") unless ns.hash_to_bnodes.empty?
98
+
99
+ # Initialize the number of calls allowed to hash_n_degree_quads
100
+ # as a multiple of the total number of blank nodes in the dataset.
101
+ ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
102
+
82
103
  ns.hash_to_bnodes.keys.sort.each do |hash|
83
104
  identifier_list = ns.hash_to_bnodes[hash]
84
105
 
@@ -105,27 +126,29 @@ module RDF::Normalize
105
126
  hash_path_list.sort_by(&:first).each do |result, issuer|
106
127
  issuer.issued.each do |node|
107
128
  id = ns.canonical_issuer.issue_identifier(node)
108
- log_debug(" - blank node") {node.id}
109
- log_debug(" canonical identifier", id)
129
+ log_debug(" - blank node") {node.id}
130
+ log_debug(" canonical identifier", id)
110
131
  end
111
132
  end
112
133
  end
113
134
 
114
135
  # Step 6: Yield statements using BNodes from canonical replacements
115
- dataset.each_statement do |statement|
116
- if statement.has_blank_nodes?
117
- quad = statement.to_quad.compact.map do |term|
118
- term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
136
+ if block_given?
137
+ dataset.each_statement do |statement|
138
+ if statement.has_blank_nodes?
139
+ quad = statement.to_quad.compact.map do |term|
140
+ term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
141
+ end
142
+ block.call RDF::Statement.from(quad)
143
+ else
144
+ block.call statement
119
145
  end
120
- block.call RDF::Statement.from(quad)
121
- else
122
- block.call statement
123
146
  end
124
147
  end
125
148
 
126
149
  log_debug("ca.6:")
127
- log_debug(" log point", "Replace original with canonical labels (4.5.3 (6)).")
128
- log_debug(" canonical issuer: #{ns.canonical_issuer.inspect}")
150
+ log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
151
+ log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
129
152
  dataset
130
153
  end
131
154
 
@@ -137,10 +160,13 @@ module RDF::Normalize
137
160
  attr_accessor :bnode_to_statements
138
161
  attr_accessor :hash_to_bnodes
139
162
  attr_accessor :canonical_issuer
163
+ attr_accessor :max_calls
164
+ attr_accessor :total_calls
140
165
 
141
- def initialize(options)
166
+ def initialize(**options)
142
167
  @options = options
143
168
  @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
169
+ @max_calls, @total_calls = nil, 0
144
170
  end
145
171
 
146
172
  def add_statement(node, statement)
@@ -204,34 +230,40 @@ module RDF::Normalize
204
230
  hexdigest(input)
205
231
  end
206
232
 
207
- # @param [RDF::Node] identifier
233
+ # @param [RDF::Node] node
208
234
  # @param [IdentifierIssuer] issuer
209
235
  # @return [Array<String,IdentifierIssuer>] the Hash and issuer
210
- def hash_n_degree_quads(identifier, issuer)
236
+ # @raise [RuntimeError] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
237
+ def hash_n_degree_quads(node, issuer)
211
238
  log_debug("hndq:")
212
239
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
213
- log_debug(" identifier") {identifier.id}
240
+ log_debug(" identifier") {node.id}
214
241
  log_debug(" issuer") {issuer.inspect}
215
242
 
243
+ if max_calls && total_calls >= max_calls
244
+ raise "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
245
+ end
246
+ @total_calls += 1
247
+
216
248
  # hash to related blank nodes map
217
249
  hn = {}
218
250
 
219
251
  log_debug(" hndq.2:")
220
252
  log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
221
253
  log_debug(" quads:")
222
- bnode_to_statements[identifier].each do |s|
254
+ bnode_to_statements[node].each do |s|
223
255
  log_debug {" - #{s.to_nquads.strip}"}
224
256
  end
225
257
 
226
258
  # Step 3
227
259
  log_debug(" hndq.3:")
228
260
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
229
- log_debug(" with:") unless bnode_to_statements[identifier].empty?
230
- bnode_to_statements[identifier].each do |statement|
261
+ log_debug(" with:") unless bnode_to_statements[node].empty?
262
+ bnode_to_statements[node].each do |statement|
231
263
  log_debug {" - quad: #{statement.to_nquads.strip}"}
232
264
  log_debug(" hndq.3.1:")
233
265
  log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
234
- log_depth(depth: 10) {hash_related_statement(identifier, statement, issuer, hn)}
266
+ log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
235
267
  end
236
268
  log_debug(" Hash to bnodes:")
237
269
  hn.each do |k,v|
@@ -286,7 +318,9 @@ module RDF::Normalize
286
318
  log_debug(" with:") unless recursion_list.empty?
287
319
  recursion_list.each do |related|
288
320
  log_debug(" - related") {related.id}
289
- result = log_depth(depth: 18) {hash_n_degree_quads(related, issuer_copy)}
321
+ result = log_depth(depth: 18) do
322
+ hash_n_degree_quads(related, issuer_copy)
323
+ end
290
324
  path << '_:' + issuer_copy.issue_identifier(related)
291
325
  path << "<#{result.first}>"
292
326
  issuer_copy = result.last
@@ -337,10 +371,10 @@ module RDF::Normalize
337
371
  end
338
372
 
339
373
  # Group adjacent bnodes by hash
340
- def hash_related_statement(identifier, statement, issuer, map)
374
+ def hash_related_statement(node, statement, issuer, map)
341
375
  log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
342
376
  statement.to_h(:s, :p, :o, :g).each do |pos, term|
343
- next if !term.is_a?(RDF::Node) || term == identifier
377
+ next if !term.is_a?(RDF::Node) || term == node
344
378
 
345
379
  log_debug(" - position", pos)
346
380
  hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
@@ -374,6 +408,11 @@ module RDF::Normalize
374
408
  @issued[node]
375
409
  end
376
410
 
411
+ # @return [Hash{String => String}] the issued identifiers map
412
+ def to_hash
413
+ @issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
414
+ end
415
+
377
416
  # Duplicate this issuer, ensuring that the issued identifiers remain distinct
378
417
  # @return [IdentifierIssuer]
379
418
  def dup
@@ -2,10 +2,16 @@ module RDF::Normalize
2
2
  class URGNA2012 < RDFC10
3
3
 
4
4
  def each(&block)
5
- ns = NormalizationState.new(@options)
5
+ ns = NormalizationState.new(**@options)
6
6
  normalize_statements(ns, &block)
7
7
  end
8
8
 
9
+ def to_hash
10
+ ns = NormalizationState.new(**@options)
11
+ normalize_statements(ns)
12
+ ns.canonical_issuer.to_h
13
+ end
14
+
9
15
  class NormalizationState < RDFC10::NormalizationState
10
16
  protected
11
17
 
data/lib/rdf/normalize.rb CHANGED
@@ -52,6 +52,10 @@ module RDF
52
52
  # @param [Hash{Symbol => Object}] options
53
53
  # @option options [Base] :algorithm (:rdfc10)
54
54
  # One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
55
+ # @option options [Integer] :max_calls
56
+ # Maximum number of calls allowed for recursive blank node labeling,
57
+ # as a multiple of the total number of blank nodes in the dataset.
58
+ # @option options [Boolean] :identifier_map
55
59
  # @return [RDF::Normalize::Base]
56
60
  # @raise [ArgumentError] selected algorithm not defined
57
61
  def new(enumerable, **options)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdf-normalize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-10 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdf