RubyGems - rdf-turtle - Versions diffs - 0.0.2 - Mend

rdf-turtle 0.0.2

Files changed (16) hide show

@@ -0,0 +1,38 @@
+require 'rdf'
+module RDF
+  class List
+    ##
+    # Validate the list ensuring that
+    # * rdf:rest values are all BNodes are nil
+    # * rdf:type, if it exists, is rdf:List
+    # * each subject has no properties other than single-valued rdf:first, rdf:rest
+    #   other than for the first node in the list
+    # @return [Boolean]
+    def valid?
+      li = subject
+      while li != RDF.nil do
+        rest = nil
+        firsts = rests = 0
+        @graph.query(:subject => li) do |st|
+          case st.predicate
+          when RDF.type
+            # Be tollerant about rdf:type entries, as some OWL vocabularies use it excessively
+          when RDF.first
+            firsts += 1
+          when RDF.rest
+            rest = st.object
+            return false unless rest.node? || rest == RDF.nil
+            rests += 1
+          else
+            # First node may have other properties
+            return false unless li == subject
+          end
+        end
+        return false unless firsts == 1 && rests == 1
+        li = rest
+      end
+      true
+    end
+  end
+end

data/lib/rdf/turtle/reader.rb ADDED

@@ -0,0 +1,362 @@
+require 'rdf/turtle/meta'
+require 'rdf/ll1/parser'
+module RDF::Turtle
+  ##
+  # A parser for the Turtle 2
+  class Reader < RDF::Reader
+    format Format
+    include RDF::Turtle::Meta
+    include RDF::LL1::Parser
+    include RDF::Turtle::Terminals
+    # Terminals passed to lexer. Order matters!
+    terminal(:ANON,                 ANON) do |reader, prod, token, input|
+      input[:resource] = reader.bnode
+    end
+    terminal(:BLANK_NODE_LABEL,     BLANK_NODE_LABEL) do |reader, prod, token, input|
+      input[:resource] = reader.bnode(token.value[2..-1])
+    end
+    terminal(:IRI_REF,              IRI_REF, :unescape => true) do |reader, prod, token, input|
+      input[:resource] = reader.process_iri(token.value[1..-2])
+    end
+    terminal(:DOUBLE,               DOUBLE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.double)
+    end
+    terminal(:DOUBLE_NEGATIVE,      DOUBLE_NEGATIVE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.double)
+    end
+    terminal(:DOUBLE_POSITIVE,      DOUBLE_POSITIVE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.double)
+    end
+    terminal(:DECIMAL,              DECIMAL) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.decimal)
+    end
+    terminal(:DECIMAL_NEGATIVE,     DECIMAL_NEGATIVE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.decimal)
+    end
+    terminal(:DECIMAL_POSITIVE,     DECIMAL_POSITIVE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.decimal)
+    end
+    terminal(:INTEGER,              INTEGER) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.integer)
+    end
+    terminal(:INTEGER_NEGATIVE,     INTEGER_NEGATIVE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.integer)
+    end
+    terminal(:INTEGER_POSITIVE,     INTEGER_POSITIVE) do |reader, prod, token, input|
+      input[:resource] = reader.literal(token.value, :datatype => RDF::XSD.integer)
+    end
+    # Spec confusion: spec says : "Literals , prefixed names and IRIs may also contain escape sequences"
+    terminal(:PNAME_LN,             PNAME_LN) do |reader, prod, token, input|
+      prefix, suffix = token.value.split(":", 2)
+      input[:resource] = reader.pname(prefix, suffix)
+    end
+    # Spec confusion: spec says : "Literals , prefixed names and IRIs may also contain escape sequences"
+    terminal(:PNAME_NS,             PNAME_NS) do |reader, prod, token, input|
+      prefix = token.value[0..-2]
+      # Two contexts, one when prefix is being defined, the other when being used
+      case prod
+      when :prefixID
+        input[:prefix] = prefix
+      else
+        input[:resource] = reader.pname(prefix, '')
+      end
+    end
+    terminal(:STRING_LITERAL_LONG1, STRING_LITERAL_LONG1, :unescape => true) do |reader, prod, token, input|
+      input[:string_value] = token.value[3..-4]
+    end
+    terminal(:STRING_LITERAL_LONG2, STRING_LITERAL_LONG2, :unescape => true) do |reader, prod, token, input|
+      input[:string_value] = token.value[3..-4]
+    end
+    terminal(:STRING_LITERAL1,      STRING_LITERAL1, :unescape => true) do |reader, prod, token, input|
+      input[:string_value] = token.value[1..-2]
+    end
+    terminal(:STRING_LITERAL2,      STRING_LITERAL2, :unescape => true) do |reader, prod, token, input|
+      input[:string_value] = token.value[1..-2]
+    end
+    # String terminals
+    terminal(nil,                  %r([\(\),.;\[\]a]|\^\^|@base|@prefix|true|false)) do |reader, prod, token, input|
+      case token.value
+      when 'a'             then input[:resource] = RDF.type
+      when 'true', 'false' then input[:resource] = RDF::Literal::Boolean.new(token.value)
+      else                      input[:string] = token.value
+      end
+    end
+    terminal(:LANGTAG,              LANGTAG) do |reader, prod, token, input|
+      input[:lang] = token.value[1..-1]
+    end
+    # Productions
+    # [4] prefixID defines a prefix mapping
+    production(:prefixID) do |reader, phase, input, current, callback|
+      next unless phase == :finish
+      prefix = current[:prefix]
+      iri = current[:resource]
+      callback.call(:trace, "prefixID", "Defined prefix #{prefix.inspect} mapping to #{iri.inspect}")
+      reader.prefix(prefix, iri)
+    end
+    # [5] base set base_uri
+    production(:base) do |reader, phase, input, current, callback|
+      next unless phase == :finish
+      iri = current[:resource]
+      callback.call(:trace, "base", "Defined base as #{iri}")
+      reader.options[:base_uri] = iri
+    end
+    # [9] verb ::= predicate | "a"
+    production(:verb) do |reader, phase, input, current, callback|
+      next unless phase == :finish
+      input[:predicate] = current[:resource] if phase == :finish
+    end
+    # [10] subject ::= IRIref | blank
+    production(:subject) do |reader, phase, input, current, callback|
+      next unless phase == :finish
+      input[:subject] = current[:resource] if phase == :finish
+    end
+    # [12] object ::= IRIref | blank | literal
+    production(:object) do |reader, phase, input, current, callback|
+      next unless phase == :finish
+      if input[:object_list]
+        # Part of an rdf:List collection
+        input[:object_list] << current[:resource]
+      else
+        callback.call(:trace, "object", "current: #{current.inspect}")
+        callback.call(:statement, "object", input[:subject], input[:predicate], current[:resource])
+      end
+    end
+    # [15] blankNodePropertyList ::= "[" predicateObjectList "]"
+    production(:blankNodePropertyList) do |reader, phase, input, current, callback|
+      if phase == :start
+        current[:subject] = reader.bnode
+      else
+        input[:resource] = current[:subject]
+      end
+    end
+    # [16] collection ::= "(" object* ")"
+    production(:collection) do |reader, phase, input, current, callback|
+      if phase == :start
+        current[:object_list] = []  # Tells the object production to collect and not generate statements
+      else
+        # Create an RDF list
+        bnode = reader.bnode
+        objects = current[:object_list]
+        list = RDF::List.new(bnode, nil, objects)
+        list.each_statement do |statement|
+          # Spec Confusion, referenced section "Collection" is missing from the spec.
+          # Anicdodal evidence indicates that some expect each node to be of type rdf:list,
+          # but existing Notation3 and Turtle tests (http://www.w3.org/2001/sw/DataAccess/df1/tests/manifest.ttl) do not.
+          next if statement.predicate == RDF.type && statement.object == RDF.List
+          callback.call(:statement, "collection", statement.subject, statement.predicate, statement.object)
+        end
+        bnode = RDF.nil if list.empty?
+        # Return bnode as resource
+        input[:resource] = bnode
+      end
+    end
+    # [60s] RDFLiteral ::= String ( LANGTAG | ( "^^" IRIref ) )?
+    production(:RDFLiteral) do |reader, phase, input, current, callback|
+      next unless phase == :finish
+      opts = {}
+      opts[:datatype] = current[:resource] if current[:resource]
+      opts[:language] = current[:lang] if current[:lang]
+      input[:resource] = reader.literal(current[:string_value], opts)
+    end
+    ##
+    # Missing in 0.3.2
+    def base_uri
+      @options[:base_uri]
+    end
+    ##
+    # Initializes a new parser instance.
+    #
+    # @param  [String, #to_s]          input
+    # @param  [Hash{Symbol => Object}] options
+    # @option options [Hash]     :prefixes     (Hash.new)
+    #   the prefix mappings to use (for acessing intermediate parser productions)
+    # @option options [#to_s]    :base_uri     (nil)
+    #   the base URI to use when resolving relative URIs (for acessing intermediate parser productions)
+    # @option options [#to_s]    :anon_base     ("b0")
+    #   Basis for generating anonymous Nodes
+    # @option options [Boolean] :resolve_uris (false)
+    #   Resolve prefix and relative IRIs, otherwise, when serializing the parsed SSE
+    #   as S-Expressions, use the original prefixed and relative URIs along with `base` and `prefix`
+    #   definitions.
+    # @option options [Boolean]  :validate     (false)
+    #   whether to validate the parsed statements and values. If not validating,
+    #   the parser will attempt to recover from errors.
+    # @option options [Boolean] :progress
+    #   Show progress of parser productions
+    # @option options [Boolean] :debug
+    #   Detailed debug output
+    # @return [RDF::Turtle::Reader]
+    def initialize(input = nil, options = {}, &block)
+      super do
+        @options = {:anon_base => "b0", :validate => false}.merge(options)
+        debug("def prefix", "#{base_uri.inspect}")
+        debug("validate", "#{validate?.inspect}")
+        debug("canonicalize", "#{canonicalize?.inspect}")
+        debug("intern", "#{intern?.inspect}")
+        if block_given?
+          case block.arity
+            when 0 then instance_eval(&block)
+            else block.call(self)
+          end
+        end
+      end
+    end
+    def inspect
+      sprintf("#<%s:%#0x(%s)>", self.class.name, __id__, base_uri.to_s)
+    end
+    ##
+    # Iterates the given block for each RDF statement in the input.
+    #
+    # @yield  [statement]
+    # @yieldparam [RDF::Statement] statement
+    # @return [void]
+    def each_statement(&block)
+      @callback = block
+      parse(@input, START.to_sym, @options.merge(:branch => BRANCH, :follow => FOLLOW)) do |context, *data|
+        case context
+        when :statement
+          add_triple(*data)
+        when :trace
+          debug(*data)
+        end
+      end
+    rescue RDF::LL1::Parser::Error => e
+      error("each_statement", e.message, :backtrace => e.backtrace)
+    end
+    ##
+    # Iterates the given block for each RDF triple in the input.
+    #
+    # @yield  [subject, predicate, object]
+    # @yieldparam [RDF::Resource] subject
+    # @yieldparam [RDF::URI]      predicate
+    # @yieldparam [RDF::Value]    object
+    # @return [void]
+    def each_triple(&block)
+      each_statement do |statement|
+        block.call(*statement.to_triple)
+      end
+    end
+    # add a statement, object can be literal or URI or bnode
+    #
+    # @param [Nokogiri::XML::Node, any] node:: XML Node or string for showing context
+    # @param [URI, Node] subject:: the subject of the statement
+    # @param [URI] predicate:: the predicate of the statement
+    # @param [URI, Node, Literal] object:: the object of the statement
+    # @return [Statement]:: Added statement
+    # @raise [RDF::ReaderError]:: Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
+    def add_triple(node, subject, predicate, object)
+      statement = RDF::Statement.new(subject, predicate, object)
+      if statement.valid?
+        debug(node, "generate statement: #{statement}")
+        @callback.call(statement)
+      else
+        error(node, "Statement is invalid: #{statement.inspect}")
+      end
+    end
+    def process_iri(iri)
+      iri(base_uri, iri)
+    end
+    # Create IRIs
+    def iri(value, append = nil)
+      value = RDF::URI.new(value)
+      value = value.join(append) if append
+      value.validate! if validate? && value.respond_to?(:validate)
+      value.canonicalize! if canonicalize?
+      value = RDF::URI.intern(value) if intern?
+      value
+    end
+    # Create a literal
+    def literal(value, options = {})
+      options = options.dup
+      # Internal representation is to not use xsd:string, although it could arguably go the other way.
+      options.delete(:datatype) if options[:datatype] == RDF::XSD.string
+      debug("literal", "value: #{value.inspect}, options: #{options.inspect}, validate: #{validate?.inspect}, c14n?: #{canonicalize?.inspect}")
+      RDF::Literal.new(value, options.merge(:validate => validate?, :canonicalize => canonicalize?))
+    end
+    ##
+    # Override #prefix to take a relative IRI
+    #
+    # prefix directives map a local name to an IRI, also resolved against the current In-Scope Base URI.
+    # Spec confusion, presume that an undefined empty prefix has an empty relative IRI, which uses
+    # string contatnation rules against the in-scope IRI at the time of use
+    def prefix(prefix, iri = nil)
+      debug("prefix", "'#{prefix}' <#{iri}>")
+      # Relative IRIs are resolved against @base
+      iri = process_iri(iri) if iri
+      super(prefix, iri)
+    end
+    ##
+    # Expand a PNAME using string concatenation
+    def pname(prefix, suffix)
+      # Prefixes must be defined, except special case for empty prefix being alias for current @base
+      if prefix(prefix)
+        base = prefix(prefix).to_s
+      elsif prefix.to_s.empty?
+        base = base_uri.to_s
+      else
+        error("pname", "undefined prefix #{prefix.inspect}") unless prefix(prefix) || prefix.to_s.empty?
+      end
+      suffix = suffix.to_s.sub(/^\#/, "") if base.index("#")
+      debug("pname", "base: '#{base}', suffix: '#{suffix}'")
+      iri(base + suffix.to_s)
+    end
+    # Keep track of allocated BNodes
+    def bnode(value = nil)
+      return RDF::Node.new unless value
+      @bnode_cache ||= {}
+      @bnode_cache[value.to_s] ||= RDF::Node.new(value)
+    end
+    # @param [String] str Error string
+    # @param [Hash] options
+    # @option options [URI, #to_s] :production
+    # @option options [Token] :token
+    def error(node, message, options = {})
+      if !@options[:validate] && !options[:fatal]
+        debug(node, message, options)
+      else
+        raise RDF::ReaderError, message, options[:backtrace]
+      end
+    end
+    ##
+    # Progress output when debugging
+    # @param [String] str
+    def debug(node, message, options = {})
+      depth = options[:depth] || self.depth
+      str = "[#{@lineno}]#{' ' * depth}#{node}: #{message}"
+      @options[:debug] << str if @options[:debug].is_a?(Array)
+      $stderr.puts(str) if RDF::Turtle.debug?
+    end
+  end # class Reader
+end # module RDF::Turtle

data/lib/rdf/turtle/terminals.rb ADDED

@@ -0,0 +1,88 @@
+require 'rdf/ll1/lexer'
+module RDF::Turtle
+  module Terminals
+    # Definitions of token regular expressions used for lexical analysis
+    if RUBY_VERSION >= '1.9'
+      ##
+      # Unicode regular expressions for Ruby 1.9+ with the Oniguruma engine.
+      U_CHARS1         = Regexp.compile(<<-EOS.gsub(/\s+/, ''))
+                           [\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u02FF]|
+                           [\\u0370-\\u037D]|[\\u037F-\\u1FFF]|[\\u200C-\\u200D]|
+                           [\\u2070-\\u218F]|[\\u2C00-\\u2FEF]|[\\u3001-\\uD7FF]|
+                           [\\uF900-\\uFDCF]|[\\uFDF0-\\uFFFD]|[\\u{10000}-\\u{EFFFF}]
+                         EOS
+      U_CHARS2         = Regexp.compile("\\u00B7|[\\u0300-\\u036F]|[\\u203F-\\u2040]")
+    else
+      ##
+      # UTF-8 regular expressions for Ruby 1.8.x.
+      U_CHARS1         = Regexp.compile(<<-EOS.gsub(/\s+/, ''))
+                           \\xC3[\\x80-\\x96]|                                (?# [\\u00C0-\\u00D6]|)
+                           \\xC3[\\x98-\\xB6]|                                (?# [\\u00D8-\\u00F6]|)
+                           \\xC3[\\xB8-\\xBF]|[\\xC4-\\xCB][\\x80-\\xBF]|     (?# [\\u00F8-\\u02FF]|)
+                           \\xCD[\\xB0-\\xBD]|                                (?# [\\u0370-\\u037D]|)
+                           \\xCD\\xBF|[\\xCE-\\xDF][\\x80-\\xBF]|             (?# [\\u037F-\\u1FFF]|)
+                           \\xE0[\\xA0-\\xBF][\\x80-\\xBF]|                   (?# ...)
+                           \\xE1[\\x80-\\xBF][\\x80-\\xBF]|                   (?# ...)
+                           \\xE2\\x80[\\x8C-\\x8D]|                           (?# [\\u200C-\\u200D]|)
+                           \\xE2\\x81[\\xB0-\\xBF]|                           (?# [\\u2070-\\u218F]|)
+                           \\xE2[\\x82-\\x85][\\x80-\\xBF]|                   (?# ...)
+                           \\xE2\\x86[\\x80-\\x8F]|                           (?# ...)
+                           \\xE2[\\xB0-\\xBE][\\x80-\\xBF]|                   (?# [\\u2C00-\\u2FEF]|)
+                           \\xE2\\xBF[\\x80-\\xAF]|                           (?# ...)
+                           \\xE3\\x80[\\x81-\\xBF]|                           (?# [\\u3001-\\uD7FF]|)
+                           \\xE3[\\x81-\\xBF][\\x80-\\xBF]|                   (?# ...)
+                           [\\xE4-\\xEC][\\x80-\\xBF][\\x80-\\xBF]|           (?# ...)
+                           \\xED[\\x80-\\x9F][\\x80-\\xBF]|                   (?# ...)
+                           \\xEF[\\xA4-\\xB6][\\x80-\\xBF]|                   (?# [\\uF900-\\uFDCF]|)
+                           \\xEF\\xB7[\\x80-\\x8F]|                           (?# ...)
+                           \\xEF\\xB7[\\xB0-\\xBF]|                           (?# [\\uFDF0-\\uFFFD]|)
+                           \\xEF[\\xB8-\\xBE][\\x80-\\xBF]|                   (?# ...)
+                           \\xEF\\xBF[\\x80-\\xBD]|                           (?# ...)
+                           \\xF0[\\x90-\\xBF][\\x80-\\xBF][\\x80-\\xBF]|      (?# [\\u{10000}-\\u{EFFFF}])
+                           [\\xF1-\\xF2][\\x80-\\xBF][\\x80-\\xBF][\\x80-\\xBF]|
+                           \\xF3[\\x80-\\xAF][\\x80-\\xBF][\\x80-\\xBF]       (?# ...)
+                         EOS
+      U_CHARS2         = Regexp.compile(<<-EOS.gsub(/\s+/, ''))
+                           \\xC2\\xB7|                                        (?# \\u00B7|)
+                           \\xCC[\\x80-\\xBF]|\\xCD[\\x80-\\xAF]|             (?# [\\u0300-\\u036F]|)
+                           \\xE2\\x80\\xBF|\\xE2\\x81\\x80                    (?# [\\u203F-\\u2040])
+                         EOS
+    end
+    UCHAR                = RDF::LL1::Lexer::UCHAR
+    WS                   = / |\t|\r|\n  /                                         # [93s]
+    PN_CHARS_BASE        = /[A-Z]|[a-z]|#{U_CHARS1}|#{UCHAR}/                     # [95s]
+    PN_CHARS_U           = /_|#{PN_CHARS_BASE}/                                   # [96s]
+    PN_CHARS             = /-|[0-9]|#{PN_CHARS_U}|#{U_CHARS2}/                    # [98s]
+    PN_CHARS_BODY        = /(?:(?:\.|#{PN_CHARS})*#{PN_CHARS})?/
+    PN_LOCAL             = /(?:[0-9]|#{PN_CHARS_U})#{PN_CHARS_BODY}/              # [100s]
+    EXPONENT             = /[eE][+-]?[0-9]+/                                      # [86s]
+    ANON                 = /\[#{WS}*\]/                                           # [94s]
+    BLANK_NODE_LABEL     = /_:#{PN_LOCAL}/                                        # [73s]
+    DECIMAL              = /(?:[0-9]+\.[0-9]*|\.[0-9]+)/                          # [78s]
+    DECIMAL_NEGATIVE     = /\-(?:[0-9]+\.[0-9]*|\.[0-9]+)/                        # [83s]
+    DECIMAL_POSITIVE     = /\+(?:[0-9]+\.[0-9]*|\.[0-9]+)/                        # [81s]
+    DOUBLE               = /(?:[0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)#{EXPONENT}/        # [79s]
+    DOUBLE_NEGATIVE      = /\-(?:[0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)#{EXPONENT}/      # [79s]
+    DOUBLE_POSITIVE      = /\+(?:[0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)#{EXPONENT}/      # [79s]
+    ECHAR                = /\\[tbnrf\\"']/                                        # [91s]
+    INTEGER              = /[0-9]+/                                               # [77s]
+    INTEGER_NEGATIVE     = /\-[0-9]+/                                             # [83s]
+    INTEGER_POSITIVE     = /\+[0-9]+/                                             # [80s]
+    # Spec confusion: the EBNF definition of IRI_REF seems malformed, and has no
+    # provision for \^, as discussed elsewhere in the spec.
+    IRI_REF              = /<(?:[^<>"{}|^`\\\x00-\x20]|#{U_CHARS1})*>/            # [70s]
+    LANGTAG              = /@[a-zA-Z]+(?:-[a-zA-Z0-9]+)*/                         # [76s]
+    PN_PREFIX            = /#{PN_CHARS_BASE}#{PN_CHARS_BODY}/                     # [99s]
+    PNAME_NS             = /#{PN_PREFIX}?:/                                       # [71s]
+    PNAME_LN             = /#{PNAME_NS}#{PN_LOCAL}/                               # [72s]
+    STRING_LITERAL1      = /'(?:[^\'\\\n\r]|#{ECHAR}|#{UCHAR})*'/                 # [87s]
+    STRING_LITERAL2      = /"(?:[^\"\\\n\r]|#{ECHAR}|#{UCHAR})*"/                 # [88s]
+    STRING_LITERAL_LONG1 = /'''(?:(?:'|'')?(?:[^'\\]|#{ECHAR}|#{UCHAR}))*'''/m    # [89s]
+    STRING_LITERAL_LONG2 = /"""(?:(?:"|"")?(?:[^"\\]|#{ECHAR}|#{UCHAR}))*"""/m    # [90s]
+  end
+end