RubyGems - rdf-normalize - Versions diffs - 0.6.0 → 0.6.1 - Mend

rdf-normalize 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/VERSION +1 -1
data/lib/rdf/normalize/base.rb +8 -1
data/lib/rdf/normalize/carroll2001.rb +1 -3
data/lib/rdf/normalize/rdfc10.rb +64 -25
data/lib/rdf/normalize/urgna2012.rb +7 -1
data/lib/rdf/normalize.rb +4 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 15478756de443574bde6120436faf09bec1f7e40dcfc60f39fc97af92e686738
-  data.tar.gz: d5617da52a4d7e3429452f691e4a9ccb7f6ac8bedcef6dd66583b1322e0b57f0
+  metadata.gz: 7b8d7e930eb7f452fef42bd6f66b29dfcd3f526f7c9dafebbfe214ed5dfa4007
+  data.tar.gz: c4c60292f9868d39d50545dca77c8d5afd85885b0a1fc690b0ce84d2ee00ddd7
 SHA512:
-  metadata.gz: 7c2ccd4449f12d5095702d19a8c1d27539aa5afa23c8b96ffcf6f43ee0d6d10fd763e2dbc98f2ef008ede3edc3fda1801eb6a1cd3ad0e80e3b82995017ae93e4
-  data.tar.gz: f760c7336703292679c82b6abbea86ffe7b8ac1b803508c187d8aee7bcd8cd635d0b039d928b7d145198f7df884027aeb911fa2e97e0e9d171cae92e4d26ed0b
+  metadata.gz: 05dd3390670479211a348c4fbd91e0c369355f374d72d2834697641ab64b3cce438b4df8ee63f6db50af1d68e036fa747762474448e6d326d5bfa758ca120b3d
+  data.tar.gz: 6189376e59d897e1e6d4b440b6ed75f0e7de970d6cbacb6b21e7e75976e7da0b6e6899085e2972eabf307556d9b17eb5277509fde3096e3ab428a0e84758d9c8

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.6.0
+0.6.1

data/lib/rdf/normalize/base.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module RDF::Normalize
   ##
   # Abstract class for pluggable normalization algorithms. Delegates to a default or selected algorithm if instantiated
-  module Base
+  class Base
     attr_reader :dataset
     # Enumerates normalized statements
@@ -11,5 +11,12 @@ module RDF::Normalize
     def each(&block)
       raise "Not Implemented"
     end
+    # Returns a map from input blank node identifiers to canonical blank node identifiers.
+    #
+    # @return [Hash{String => String}]
+    def to_hash
+      raise "Not Implemented"
+    end
   end
 end

data/lib/rdf/normalize/carroll2001.rb CHANGED Viewed

@@ -1,8 +1,6 @@
 module RDF::Normalize
-  class Carroll2001
+  class Carroll2001 < Base
     include RDF::Enumerable
-    include Base
-    include Utils
     ##
     # Create an enumerable with grounded nodes

data/lib/rdf/normalize/rdfc10.rb CHANGED Viewed

@@ -6,27 +6,43 @@ rescue LoadError
 end
 module RDF::Normalize
-  class RDFC10
+  class RDFC10 < Base
     include RDF::Enumerable
     include RDF::Util::Logger
-    include Base
     ##
     # Create an enumerable with grounded nodes
     #
     # @param [RDF::Enumerable] enumerable
+    # @option options [Integer] :max_calls (40)
+    #   Maximum number of calls allowed for recursive blank node labeling,
+    #   as a multiple of the total number of blank nodes in the dataset.
     # @return [RDF::Enumerable]
+    # raise [RuntimeError] if the maximum number of levels of recursion is exceeded.
     def initialize(enumerable, **options)
       @dataset, @options = enumerable, options
     end
+    # Yields each normalized statement
     def each(&block)
-      ns = NormalizationState.new(@options)
+      ns = NormalizationState.new(**@options)
       log_debug("ca:")
       log_debug("  log point", "Entering the canonicalization function (4.5.3).")
       log_depth(depth: 2) {normalize_statements(ns, &block)}
     end
+    # Returns a map from input blank node identifiers to canonical blank node identifiers.
+    #
+    # @return [Hash{String => String}]
+    def to_hash
+      ns = NormalizationState.new(**@options)
+      log_debug("ca:")
+      log_debug("  log point", "Entering the canonicalization function (4.5.3).")
+      log_depth(depth: 2) {normalize_statements(ns)}
+      ns.canonical_issuer.to_hash
+    end
+    #
     protected
     def normalize_statements(ns, &block)
       # Step 2: Map BNodes to the statements they are used by
@@ -79,6 +95,11 @@ module RDF::Normalize
       log_debug("ca.5:") unless ns.hash_to_bnodes.empty?
       log_debug("  log point", "Calculate hashes for identifiers with shared hashes (4.5.3 (5)).")
       log_debug("  with:") unless ns.hash_to_bnodes.empty?
+      # Initialize the number of calls allowed to hash_n_degree_quads
+      # as a multiple of the total number of blank nodes in the dataset.
+      ns.max_calls = ns.bnode_to_statements.keys.length * @options.fetch(:max_calls, 40)
       ns.hash_to_bnodes.keys.sort.each do |hash|
         identifier_list = ns.hash_to_bnodes[hash]
@@ -105,27 +126,29 @@ module RDF::Normalize
         hash_path_list.sort_by(&:first).each do |result, issuer|
           issuer.issued.each do |node|
             id = ns.canonical_issuer.issue_identifier(node)
-            log_debug("            - blank node") {node.id}
-            log_debug("              canonical identifier", id)
+            log_debug("          - blank node") {node.id}
+            log_debug("            canonical identifier", id)
           end
         end
       end
       # Step 6: Yield statements using BNodes from canonical replacements
-      dataset.each_statement do |statement|
-        if statement.has_blank_nodes?
-          quad = statement.to_quad.compact.map do |term|
-            term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
+      if block_given?
+        dataset.each_statement do |statement|
+          if statement.has_blank_nodes?
+            quad = statement.to_quad.compact.map do |term|
+              term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)) : term
+            end
+            block.call RDF::Statement.from(quad)
+          else
+            block.call statement
           end
-          block.call RDF::Statement.from(quad)
-        else
-          block.call statement
         end
       end
       log_debug("ca.6:")
-      log_debug("  log point", "Replace original with canonical labels (4.5.3 (6)).")
-      log_debug("  canonical issuer: #{ns.canonical_issuer.inspect}")
+      log_debug("  log point", "Issued identifiers map (4.4.3 (6)).")
+      log_debug("  issued identifiers map: #{ns.canonical_issuer.inspect}")
       dataset
     end
@@ -137,10 +160,13 @@ module RDF::Normalize
       attr_accessor :bnode_to_statements
       attr_accessor :hash_to_bnodes
       attr_accessor :canonical_issuer
+      attr_accessor :max_calls
+      attr_accessor :total_calls
-      def initialize(options)
+      def initialize(**options)
         @options = options
         @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("c14n")
+        @max_calls, @total_calls = nil, 0
       end
       def add_statement(node, statement)
@@ -204,34 +230,40 @@ module RDF::Normalize
         hexdigest(input)
       end
-      # @param [RDF::Node] identifier
+      # @param [RDF::Node] node
       # @param [IdentifierIssuer] issuer
       # @return [Array<String,IdentifierIssuer>] the Hash and issuer
-      def hash_n_degree_quads(identifier, issuer)
+      # @raise [RuntimeError] If total number of calls has exceeded `max_calls` times the number of blank nodes in the dataset.
+      def hash_n_degree_quads(node, issuer)
         log_debug("hndq:")
         log_debug("  log point", "Hash N-Degree Quads function (4.9.3).")
-        log_debug("  identifier") {identifier.id}
+        log_debug("  identifier") {node.id}
         log_debug("  issuer") {issuer.inspect}
+        if max_calls && total_calls >= max_calls
+          raise "Exceeded maximum number of calls (#{total_calls}) allowed to hash_n_degree_quads"
+        end
+        @total_calls += 1
         # hash to related blank nodes map
         hn = {}
         log_debug("  hndq.2:")
         log_debug("    log point", "Quads for identifier (4.9.3 (2)).")
         log_debug("    quads:")
-        bnode_to_statements[identifier].each do |s|
+        bnode_to_statements[node].each do |s|
           log_debug {"    - #{s.to_nquads.strip}"}
         end
         # Step 3
         log_debug("  hndq.3:")
         log_debug("    log point", "Hash N-Degree Quads function (4.9.3 (3)).")
-        log_debug("    with:") unless bnode_to_statements[identifier].empty?
-        bnode_to_statements[identifier].each do |statement|
+        log_debug("    with:") unless bnode_to_statements[node].empty?
+        bnode_to_statements[node].each do |statement|
           log_debug {"      - quad: #{statement.to_nquads.strip}"}
           log_debug("        hndq.3.1:")
           log_debug("          log point", "Hash related bnode component (4.9.3 (3.1))")
-          log_depth(depth: 10) {hash_related_statement(identifier, statement, issuer, hn)}
+          log_depth(depth: 10) {hash_related_statement(node, statement, issuer, hn)}
         end
         log_debug("    Hash to bnodes:")
         hn.each do |k,v|
@@ -286,7 +318,9 @@ module RDF::Normalize
             log_debug("              with:") unless recursion_list.empty?
             recursion_list.each do |related|
               log_debug("                - related") {related.id}
-              result = log_depth(depth: 18) {hash_n_degree_quads(related, issuer_copy)}
+              result = log_depth(depth: 18) do
+                hash_n_degree_quads(related, issuer_copy)
+              end
               path << '_:' + issuer_copy.issue_identifier(related)
               path << "<#{result.first}>"
               issuer_copy = result.last
@@ -337,10 +371,10 @@ module RDF::Normalize
       end
       # Group adjacent bnodes by hash
-      def hash_related_statement(identifier, statement, issuer, map)
+      def hash_related_statement(node, statement, issuer, map)
         log_debug("with:") if statement.to_h.values.any? {|t| t.is_a?(RDF::Node)}
         statement.to_h(:s, :p, :o, :g).each do |pos, term|
-          next if !term.is_a?(RDF::Node) || term == identifier
+          next if !term.is_a?(RDF::Node) || term == node
           log_debug("  - position", pos)
           hash = log_depth(depth: 4) {hash_related_node(term, statement, issuer, pos)}
@@ -374,6 +408,11 @@ module RDF::Normalize
         @issued[node]
       end
+      # @return [Hash{Symbol => Symbol}] the issued identifiers map
+      def to_hash
+        @issued.inject({}) {|memo, (node, canon)| memo.merge(node.id => canon)}
+      end
       # Duplicate this issuer, ensuring that the issued identifiers remain distinct
       # @return [IdentifierIssuer]
       def dup

data/lib/rdf/normalize/urgna2012.rb CHANGED Viewed

@@ -2,10 +2,16 @@ module RDF::Normalize
   class URGNA2012 < RDFC10
     def each(&block)
-      ns = NormalizationState.new(@options)
+      ns = NormalizationState.new(**@options)
       normalize_statements(ns, &block)
     end
+    def to_hash
+      ns = NormalizationState.new(**@options)
+      normalize_statements(ns)
+      ns.canonical_issuer.to_h
+    end
     class NormalizationState < RDFC10::NormalizationState
       protected

data/lib/rdf/normalize.rb CHANGED Viewed

@@ -52,6 +52,10 @@ module RDF
     # @param [Hash{Symbol => Object}] options
     # @option options [Base] :algorithm (:rdfc10)
     #   One of `:carroll2001`, `:urgna2012`, or `:rdfc10`
+    # @option options [Integer] :max_calls
+    #   Maximum number of calls allowed for recursive blank node labeling,
+    #   as a multiple of the total number of blank nodes in the dataset.
+    # @option options [Boolean] :identifier_map
     # @return [RDF::Normalize::Base]
     # @raise [ArgumentError] selected algorithm not defined
     def new(enumerable, **options)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rdf-normalize
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.6.1
 platform: ruby
 authors:
 - Gregg Kellogg
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-06-10 00:00:00.000000000 Z
+date: 2023-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rdf