rdf-normalize 0.6.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 15478756de443574bde6120436faf09bec1f7e40dcfc60f39fc97af92e686738
4
- data.tar.gz: d5617da52a4d7e3429452f691e4a9ccb7f6ac8bedcef6dd66583b1322e0b57f0
3
+ metadata.gz: 7b8d7e930eb7f452fef42bd6f66b29dfcd3f526f7c9dafebbfe214ed5dfa4007
4
+ data.tar.gz: c4c60292f9868d39d50545dca77c8d5afd85885b0a1fc690b0ce84d2ee00ddd7
5
5
  SHA512:
6
- metadata.gz: 7c2ccd4449f12d5095702d19a8c1d27539aa5afa23c8b96ffcf6f43ee0d6d10fd763e2dbc98f2ef008ede3edc3fda1801eb6a1cd3ad0e80e3b82995017ae93e4
7
- data.tar.gz: f760c7336703292679c82b6abbea86ffe7b8ac1b803508c187d8aee7bcd8cd635d0b039d928b7d145198f7df884027aeb911fa2e97e0e9d171cae92e4d26ed0b
6
+ metadata.gz: 05dd3390670479211a348c4fbd91e0c369355f374d72d2834697641ab64b3cce438b4df8ee63f6db50af1d68e036fa747762474448e6d326d5bfa758ca120b3d
7
+ data.tar.gz: 6189376e59d897e1e6d4b440b6ed75f0e7de970d6cbacb6b21e7e75976e7da0b6e6899085e2972eabf307556d9b17eb5277509fde3096e3ab428a0e84758d9c8
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.6.1
@@ -1,7 +1,7 @@
1
1
  module RDF::Normalize
2
2
  ##
3
3
  # Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
4
- module Base
4
+ class Base
5
5
  attr_reader :dataset
6
6
 
7
7
  # Enumerates normalized statements
@@ -11,5 +11,12 @@ module RDF::Normalize
11
11
  def each(&block)
12
12
  raise "Not Implemented"
13
13
  end
14
+
15
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
16
+ #
17
+ # @return [Hash{String => String}]
18
+ def to_hash
19
+ raise "Not Implemented"
20
+ end
14
21
  end
15
22
  end
@@ -1,8 +1,6 @@
1
1
  module RDF::Normalize
2
- class Carroll2001
2
+ class Carroll2001 < Base
3
3
  include RDF::Enumerable
4
- include Base
5
- include Utils
6
4
 
7
5
  ##
8
6
  # Create an enumerable with grounded nodes
@@ -6,27 +6,43 @@ rescue LoadError
6
6
  end
7
7
 
8
8
  module RDF::Normalize
9
- class RDFC10
9
+ class RDFC10 < Base
10
10
  include RDF::Enumerable
11
11
  include RDF::Util::Logger
12
- include Base
13
12
 
14
13
  ##
15
14
  # Create an enumerable with grounded nodes
16
15
  #
17
16
  # @param [RDF::Enumerable] enumerable
17
+ # @option options [Integer] :max_calls (40)
18
+ # Maximum number of calls allowed for recursive blank node labeling,
19
+ # as a multiple of the total number of blank nodes in the dataset.
18
20
  # @return [RDF::Enumerable]
21
+ # @raise [RuntimeError] if the maximum number of calls allowed for recursive blank node labeling is exceeded.
19
22
  def initialize(enumerable, **options)
20
23
  @dataset, @options = enumerable, options
21
24
  end
22
25
 
26
+ # Yields each normalized statement
23
27
  def each(&block)
24
- ns = NormalizationState.new(@options)
28
+ ns = NormalizationState.new(**@options)
25
29
  log_debug("ca:")
26
30
  log_debug(" log point", "Entering the canonicalization function (4.5.3).")
27
31
  log_depth(depth: 2) {normalize_statements(ns, &block)}
28
32
  end
29
33
 
34
+ # Returns a map from input blank node identifiers to canonical blank node identifiers.
35
+ #
36
+ # @return [Hash{String => String}]
37
+ def to_hash
38
+ ns = NormalizationState.new(**@options)
39
+ log_debug("ca:")
40
+ log_debug(" log point", "Entering the canonicalization function (4.5.3).")
41
+ log_depth(depth: 2) {normalize_statements(ns)}
42
+ ns.canonical_issuer.to_hash
43
+ end
44
+
45
+ #
30
46
  protected
31
47
  def normalize_statements(ns, &block)
32
48
  # Step 2: Map BNodes to the statements they are used by
@@ -79,6 +95,11 @@ module RDF::Normalize
79
95
  log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
80
96
  log_debug(" log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
81
97
  log_debug(" with:") unless ns.hash_to_bnodes.empty?
98
+
99
+ # Initialize the number of calls allowed to hash_n_degree_quads
100
+ # as a multiple of the total number of blank nodes in the dataset.
101
+ ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
102
+
82
103
  ns.hash_to_bnodes.keys.sort.each do |hash|
83
104
  identifier_list = ns.hash_to_bnodes[hash]
84
105
 
@@ -105,27 +126,29 @@ module RDF::Normalize
105
126
  hash_path_list.sort_by(&:first).each do |result, issuer|
106
127
  issuer.issued.each do |node|
107
128
  id = ns.canonical_issuer.issue_identifier(node)
108
- log_debug(" - blank node") {node.id}
109
- log_debug(" canonical identifier", id)
129
+ log_debug(" - blank node") {node.id}
130
+ log_debug(" canonical identifier", id)
110
131
  end
111
132
  end
112
133
  end
113
134
 
114
135
  # Step 6: Yield statements using BNodes from canonical replacements
115
- dataset.each_statement do |statement|
116
- if statement.has_blank_nodes?
117
- quad = statement.to_quad.compact.map do |term|
118
- term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
136
+ if block_given?
137
+ dataset.each_statement do |statement|
138
+ if statement.has_blank_nodes?
139
+ quad = statement.to_quad.compact.map do |term|
140
+ term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
141
+ end
142
+ block.call RDF::Statement.from(quad)
143
+ else
144
+ block.call statement
119
145
  end
120
- block.call RDF::Statement.from(quad)
121
- else
122
- block.call statement
123
146
  end
124
147
  end
125
148
 
126
149
  log_debug("ca.6:")
127
- log_debug(" log point", "Replace original with canonical labels (4.5.3 (6)).")
128
- log_debug(" canonical issuer: #{ns.canonical_issuer.inspect}")
150
+ log_debug(" log point", "Issued identifiers map (4.4.3 (6)).")
151
+ log_debug(" issued identifiers map: #{ns.canonical_issuer.inspect}")
129
152
  dataset
130
153
  end
131
154
 
@@ -137,10 +160,13 @@ module RDF::Normalize
137
160
  attr_accessor :bnode_to_statements
138
161
  attr_accessor :hash_to_bnodes
139
162
  attr_accessor :canonical_issuer
163
+ attr_accessor :max_calls
164
+ attr_accessor :total_calls
140
165
 
141
- def initialize(options)
166
+ def initialize(**options)
142
167
  @options = options
143
168
  @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
169
+ @max_calls, @total_calls = nil, 0
144
170
  end
145
171
 
146
172
  def add_statement(node, statement)
@@ -204,34 +230,40 @@ module RDF::Normalize
204
230
  hexdigest(input)
205
231
  end
206
232
 
207
- # @param [RDF::Node] identifier
233
+ # @param [RDF::Node] node
208
234
  # @param [IdentifierIssuer] issuer
209
235
  # @return [Array<String,IdentifierIssuer>] the Hash and issuer
210
- def hash_n_degree_quads(identifier, issuer)
236
+ # @raise [RuntimeError] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
237
+ def hash_n_degree_quads(node, issuer)
211
238
  log_debug("hndq:")
212
239
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3).")
213
- log_debug(" identifier") {identifier.id}
240
+ log_debug(" identifier") {node.id}
214
241
  log_debug(" issuer") {issuer.inspect}
215
242
 
243
+ if max_calls && total_calls >= max_calls
244
+ raise "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
245
+ end
246
+ @total_calls += 1
247
+
216
248
  # hash to related blank nodes map
217
249
  hn = {}
218
250
 
219
251
  log_debug(" hndq.2:")
220
252
  log_debug(" log point", "Quads for identifier (4.9.3 (2)).")
221
253
  log_debug(" quads:")
222
- bnode_to_statements[identifier].each do |s|
254
+ bnode_to_statements[node].each do |s|
223
255
  log_debug {" - #{s.to_nquads.strip}"}
224
256
  end
225
257
 
226
258
  # Step 3
227
259
  log_debug(" hndq.3:")
228
260
  log_debug(" log point", "Hash N-Degree Quads function (4.9.3 (3)).")
229
- log_debug(" with:") unless bnode_to_statements[identifier].empty?
230
- bnode_to_statements[identifier].each do |statement|
261
+ log_debug(" with:") unless bnode_to_statements[node].empty?
262
+ bnode_to_statements[node].each do |statement|
231
263
  log_debug {" - quad: #{statement.to_nquads.strip}"}
232
264
  log_debug(" hndq.3.1:")
233
265
  log_debug(" log point", "Hash related bnode component (4.9.3 (3.1))")
234
- log_depth(depth: 10) {hash_related_statement(identifier, statement, issuer, hn)}
266
+ log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
235
267
  end
236
268
  log_debug(" Hash to bnodes:")
237
269
  hn.each do |k,v|
@@ -286,7 +318,9 @@ module RDF::Normalize
286
318
  log_debug(" with:") unless recursion_list.empty?
287
319
  recursion_list.each do |related|
288
320
  log_debug(" - related") {related.id}
289
- result = log_depth(depth: 18) {hash_n_degree_quads(related, issuer_copy)}
321
+ result = log_depth(depth: 18) do
322
+ hash_n_degree_quads(related, issuer_copy)
323
+ end
290
324
  path << '_:' + issuer_copy.issue_identifier(related)
291
325
  path << "<#{result.first}>"
292
326
  issuer_copy = result.last
@@ -337,10 +371,10 @@ module RDF::Normalize
337
371
  end
338
372
 
339
373
  # Group adjacent bnodes by hash
340
- def hash_related_statement(identifier, statement, issuer, map)
374
+ def hash_related_statement(node, statement, issuer, map)
341
375
  log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
342
376
  statement.to_h(:s, :p, :o, :g).each do |pos, term|
343
- next if !term.is_a?(RDF::Node) || term == identifier
377
+ next if !term.is_a?(RDF::Node) || term == node
344
378
 
345
379
  log_debug(" - position", pos)
346
380
  hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
@@ -374,6 +408,11 @@ module RDF::Normalize
374
408
  @issued[node]
375
409
  end
376
410
 
411
+ # @return [Hash{String => String}] the issued identifiers map, from input blank node identifiers to canonical identifiers
412
+ def to_hash
413
+ @issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
414
+ end
415
+
377
416
  # Duplicate this issuer, ensuring that the issued identifiers remain distinct
378
417
  # @return [IdentifierIssuer]
379
418
  def dup
@@ -2,10 +2,16 @@ module RDF::Normalize
2
2
  class URGNA2012 < RDFC10
3
3
 
4
4
  def each(&block)
5
- ns = NormalizationState.new(@options)
5
+ ns = NormalizationState.new(**@options)
6
6
  normalize_statements(ns, &block)
7
7
  end
8
8
 
9
+ def to_hash
10
+ ns = NormalizationState.new(**@options)
11
+ normalize_statements(ns)
12
+ ns.canonical_issuer.to_h
13
+ end
14
+
9
15
  class NormalizationState < RDFC10::NormalizationState
10
16
  protected
11
17
 
data/lib/rdf/normalize.rb CHANGED
@@ -52,6 +52,10 @@ module RDF
52
52
  # @param [Hash{Symbol => Object}] options
53
53
  # @option options [Base] :algorithm (:rdfc10)
54
54
  # One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
55
+ # @option options [Integer] :max_calls
56
+ # Maximum number of calls allowed for recursive blank node labeling,
57
+ # as a multiple of the total number of blank nodes in the dataset.
58
+ # @option options [Boolean] :identifier_map
55
59
  # @return [RDF::Normalize::Base]
56
60
  # @raise [ArgumentError] selected algorithm not defined
57
61
  def new(enumerable, **options)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdf-normalize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-10 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdf